├── c ├── python ├── examples │ ├── bad_pyspark.py.fail │ ├── __init__.py │ ├── spark_expectations_example.py.fail │ ├── test_load_previous_run_data.py │ ├── dual_write.py │ ├── test_dual_write.py │ ├── load_previous_run_data.py │ ├── test_dual_write_new.py │ ├── pandera_ex.py │ ├── SQLLineage.py │ ├── udf.py │ ├── spark_expectations_example.py │ ├── bad_pyspark.py │ └── simple_perf.py ├── .flake8 ├── README.md ├── pyproject.toml ├── requirements.txt ├── setup.cfg └── tox.ini ├── project ├── build.properties └── plugins.sbt ├── sql ├── iceberg-schema-evolution-gotcha-possibility.sql.expected_to_fail ├── partioned_table_join.sql.conf ├── nonpartitioned_table_join.sql.conf ├── nonpartitioned_table_join.sql ├── gluten_only_nonpartitioned_table_join.sql ├── partioned_table_join.sql ├── iceberg-schema-evolution-gotcha-possibility.sql ├── iceberg-schema-evolution-gotcha-workaround.sql └── wap.sql ├── .jvmopts ├── native └── src │ ├── c │ ├── sum.h │ ├── sum.c │ ├── sumf_wrapper.c │ ├── sum_wrapper.c │ ├── include │ │ └── com_highperformancespark_examples_ffi_SumJNI.h │ └── gluten │ │ └── GlutenUDF.cpp │ ├── fortran │ └── sumf.f95 │ └── CMakeLists.txt ├── resources ├── rawpanda.json └── mysql-connector-java-5.1.38.jar ├── data └── project.csv ├── core └── src │ ├── main │ ├── julia │ │ ├── setup.jl │ │ └── wc.jl │ ├── perl │ │ ├── Changes │ │ ├── MANIFEST │ │ ├── ignore.txt │ │ ├── t │ │ │ ├── 00-load.t │ │ │ ├── manifest.t │ │ │ ├── pod.t │ │ │ └── pod-coverage.t │ │ ├── ghinfo.pl │ │ ├── Makefile.PL │ │ ├── xt │ │ │ └── boilerplate.t │ │ ├── README │ │ └── lib │ │ │ └── HighPerformanceSpark │ │ │ └── Examples.pm │ ├── java │ │ └── com │ │ │ └── highperformancespark │ │ │ └── examples │ │ │ ├── ffi │ │ │ └── SumJNIJava.java │ │ │ ├── objects │ │ │ ├── JavaCoffeeShop.java │ │ │ ├── JavaPandaPlace.java │ │ │ ├── JavaPandas.java │ │ │ ├── JavaPandaInfo.java │ │ │ └── JavaRawPanda.java │ │ │ ├── WordCount.java │ │ │ ├── JavaInterop.java │ │ │ └── dataframe │ │ │ ├── JavaUDFs.java │ │ │ └── JavaLoadSave.java │ ├── scala │ │ └── com │ │ │ └── high-performance-spark-examples │ │ │ ├── native │ │ │ ├── SumJNA.scala │ │ │ ├── NativeExample.scala │ │ │ ├── SumJNI.scala │ │ │ ├── StandAlone.scala │ │ │ ├── SumFJNA.scala │ │ │ └── PipeExample.scala │ │ │ ├── ml │ │ │ ├── SimpleExport.scala │ │ │ └── CustomPipeline.scala │ │ │ ├── tokenize │ │ │ └── SampleTokenize.scala │ │ │ ├── dataframe │ │ │ ├── SQLExtension.scala │ │ │ ├── RegularSQL.scala │ │ │ ├── NullabilityFilterOptimizer.scala │ │ │ ├── RawPandas.scala │ │ │ ├── UDFs.scala │ │ │ ├── MixedDataset.scala_back │ │ │ ├── MixedDataset.scala │ │ │ └── LoadSave.scala │ │ │ ├── streaming │ │ │ ├── Structured.scala │ │ │ └── DStream.scala │ │ │ ├── transformations │ │ │ ├── NarrowAndWide.scala │ │ │ └── Accumulators.scala │ │ │ ├── tools │ │ │ ├── ResourceProfileEx.scala │ │ │ ├── SampleData.scala │ │ │ ├── FilterInvalidPandas.scala │ │ │ └── GenerateScalingData.scala │ │ │ ├── wordcount │ │ │ └── WordCount.scala │ │ │ ├── errors │ │ │ └── throws.scala │ │ │ ├── perf │ │ │ └── SimplePerfTest.scala │ │ │ ├── goldilocks │ │ │ └── RDDJoinExamples.scala │ │ │ └── mllib │ │ │ └── GoldilocksMLlib.scala │ └── r │ │ ├── dapply.R │ │ └── wc.R │ └── test │ ├── scala │ └── com │ │ ├── highperformancespark │ │ └── examples │ │ │ └── JavaInteropHelper.scala │ │ └── high-performance-spark-examples │ │ ├── native │ │ ├── PipeExampleSuite.scala │ │ └── NativeExample.scala │ │ ├── errors │ │ └── ThrowsSuite.scala │ │ ├── goldilocks │ │ ├── JoinTest.scala │ │ ├── 
SortingTests.scala │ │ ├── EvaluationTests.scala │ │ └── GoldilocksLargeTests.scala │ │ ├── streaming │ │ └── DStreamSuite.scala │ │ ├── tokenize │ │ └── SampleTokenizeSuite.scala │ │ ├── ml │ │ ├── CustomPipeline.scala │ │ └── SimpleNaiveBayes.scala │ │ ├── mllib │ │ └── GoldilocksMLlibSuite.scala │ │ ├── tools │ │ ├── FilterInvalidPandasSuite.scala │ │ └── GenerateScalingDataSuite.scala │ │ ├── transformations │ │ └── Accumulators.scala │ │ ├── dataframe │ │ ├── PandaPlaceFilterPushdown.scala │ │ └── SQLExtensionTest.scala │ │ └── wordcount │ │ └── WordCountTest.scala │ └── java │ └── com │ └── highperformancespark │ └── examples │ └── JavaInteropTest.java ├── Dockerfile ├── accelerators ├── run_gluten.sh ├── install_rust_if_needed.sh ├── gluten_config.properties ├── comet_ex.sh ├── setup_gluten_deps.sh ├── setup_gluten_from_src.sh ├── gluten_spark_34_ex.sh ├── comet_env_setup.sh ├── setup_comet.sh ├── gluten_env_setup.sh └── setup_gluten_spark34.sh ├── migration └── sql.sh ├── shell-scripts └── launch-with-mysql-jdbc ├── conf ├── sbtconfig.txt └── log4j.properties ├── misc ├── container_launch.sh └── kernel.json ├── run_container.sh ├── se_simple.json ├── .scalafix.conf ├── appveyor.yml ├── target-validator ├── runme.sh └── ex.yaml ├── high_performance_pyspark └── __init__.py ├── se_complex.json ├── README.md ├── .gitignore ├── run_sql_examples.sh ├── env_setup.sh ├── run_pyspark_examples.sh └── Dockerfile-mini /c: -------------------------------------------------------------------------------- 1 | bloop 2 | 3 | -------------------------------------------------------------------------------- /python/examples/bad_pyspark.py.fail: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.9.9 2 | -------------------------------------------------------------------------------- /python/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 -------------------------------------------------------------------------------- /python/examples/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = 0.2 2 | -------------------------------------------------------------------------------- /python/examples/spark_expectations_example.py.fail: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | Python examples for High Performance Spark 2 | -------------------------------------------------------------------------------- /sql/iceberg-schema-evolution-gotcha-possibility.sql.expected_to_fail: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.jvmopts: -------------------------------------------------------------------------------- 1 | -Xms4096M 2 | -Xmx8096M 3 | -Xss2M 4 | -XX:MaxMetaspaceSize=4024M -------------------------------------------------------------------------------- /native/src/c/sum.h: -------------------------------------------------------------------------------- 1 | #ifndef _SUM_H 2 | #define _SUM_H 3 | 4 
| int sum(int input[], int num_elem); 5 | 6 | #endif /* _SUM_H */ 7 | -------------------------------------------------------------------------------- /resources/rawpanda.json: -------------------------------------------------------------------------------- 1 | {"name":"mission","pandas":[{"id":1,"zip":"94110","pt":"giant", "happy":true, 2 | "attributes":[0.4,0.5]}]} 3 | -------------------------------------------------------------------------------- /data/project.csv: -------------------------------------------------------------------------------- 1 | creator,projectname,stars 2 | holdenk,spark-upgrade,17 3 | krisnova,rust-nova,71 4 | kbendick,MongoMart,6 5 | mateiz,spark,36600 -------------------------------------------------------------------------------- /native/src/fortran/sumf.f95: -------------------------------------------------------------------------------- 1 | INTEGER FUNCTION SUMF(N,A) BIND(C, NAME='sumf') 2 | INTEGER A(N) 3 | SUMF=SUM(A) 4 | END 5 | -------------------------------------------------------------------------------- /core/src/main/julia/setup.jl: -------------------------------------------------------------------------------- 1 | Pkg.clone("https://github.com/dfdx/Spark.jl") 2 | Pkg.build("Spark") 3 | # we also need latest master of JavaCall.jl 4 | Pkg.checkout("JavaCall") -------------------------------------------------------------------------------- /resources/mysql-connector-java-5.1.38.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/high-performance-spark/high-performance-spark-examples/HEAD/resources/mysql-connector-java-5.1.38.jar -------------------------------------------------------------------------------- /core/src/main/perl/Changes: -------------------------------------------------------------------------------- 1 | Revision history for HighPerformanceSpark-Examples 2 | 3 | 0.01 Date/time 4 | First version, released on an unsuspecting world. 
5 | 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG base 2 | FROM $base 3 | 4 | USER root 5 | RUN pip install --no-cache-dir pyarrow pyiceberg[pandas,snappy,daft,s3fs] avro fastavro 6 | USER dev 7 | RUN sbt clean compile 8 | -------------------------------------------------------------------------------- /native/src/c/sum.c: -------------------------------------------------------------------------------- 1 | #include "sum.h" 2 | 3 | int sum(int input[], int num_elem) { 4 | int c, ret = 0; 5 | for (c = 0; c < num_elem; c++) { 6 | ret += input[c]; 7 | } 8 | return ret; 9 | } 10 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 58.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [[tool.mypy.overrides]] 6 | module = "examples" 7 | ignore_missing_imports = true 8 | -------------------------------------------------------------------------------- /core/src/main/perl/MANIFEST: -------------------------------------------------------------------------------- 1 | Changes 2 | lib/HighPerformanceSpark/Examples.pm 3 | Makefile.PL 4 | MANIFEST This list of files 5 | README 6 | t/00-load.t 7 | t/manifest.t 8 | t/pod-coverage.t 9 | t/pod.t 10 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | spark-testing-base 2 | pandas 3 | pyarrow 4 | pyspark==3.5.0 5 | pyspark-asyncactions 6 | pandera 7 | pandera[pyspark] 8 | spark-expectations>=1.0 9 | venv-pack 10 | requests 11 | numpy<2.0 12 | -------------------------------------------------------------------------------- /accelerators/run_gluten.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | "${SPARK_HOME}/bin/spark-shell" --master local --jars "${ACCEL_JARS}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_2.12-1.1.1.jar" --spark-properties=gluten_config.properties 4 | -------------------------------------------------------------------------------- /native/src/c/sumf_wrapper.c: -------------------------------------------------------------------------------- 1 | // Fortran routine 2 | extern int sumf(int *, int[]); 3 | 4 | // Call the fortran code which expects by reference size 5 | int wrap_sum(int input[], int size) { 6 | return sumf(&size, input); 7 | } 8 | -------------------------------------------------------------------------------- /core/src/main/julia/wc.jl: -------------------------------------------------------------------------------- 1 | using Spark 2 | sc = SparkContext(master="local") 3 | path = string("file:///", ENV["SPARK_HOME"], "/README.md") 4 | txt = text_file(sc, path) 5 | # Normally we would use a flatmap, but currently only has map_partitions 6 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ffi; 2 | 3 | // tag::sumJNIJava[] 4 | class SumJNIJava { 5 | public static native Integer sum(Integer[] array); 6 | } 7 | // end::sumJNIJava[] 8 | 
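// Hedged note (not part of the original file): before a native method like the one
// above can be invoked, the shared native library must be loaded, e.g. via a static
// initializer:
//
//   static { System.loadLibrary("high-performance-spark0"); }
//   Integer total = SumJNIJava.sum(new Integer[]{1, 2, 3});
//
// The library name is an assumption borrowed from the Scala SumJNA/SumJNI examples in
// this repo (StandAlone.scala uses "highPerformanceSpark0"). Also note the boxed
// Integer[] signature: the C wrapper in native/src/c/sum_wrapper.c binds to the Scala
// SumJNI class and operates on a primitive jintArray, so this Java class appears to be
// illustrative rather than backed by a matching native implementation in this tree.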
-------------------------------------------------------------------------------- /migration/sql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install sqlfluff 4 | python -m pip install 'sqlfluff-plugin-sparksql-upgrade @ git+https://github.com/holdenk/spark-upgrade#subdirectory=sql' 5 | 6 | sqlfluff rules |grep -i spark 7 | sqlfluff fix --dialect sparksql farts.sql 8 | -------------------------------------------------------------------------------- /shell-scripts/launch-with-mysql-jdbc: -------------------------------------------------------------------------------- 1 | ASSEMBLY_JAR=./target/scala-2.10/examples_2.10.jar 2 | CLASS="com.highperformancespark.dataframe.mysqlload" 3 | #tag:[submit] 4 | spark-submit --jars ./resources/mysql-connector-java-5.1.38.jar $ASSEMBLY_JAR $CLASS 5 | #end:[submit] -------------------------------------------------------------------------------- /accelerators/install_rust_if_needed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ -f "$HOME/.cargo/env" ]; then 3 | source "$HOME/.cargo/env" 4 | fi 5 | 6 | if ! command -v cargo; then 7 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 8 | source "$HOME/.cargo/env" 9 | fi 10 | -------------------------------------------------------------------------------- /accelerators/gluten_config.properties: -------------------------------------------------------------------------------- 1 | spark.plugins=io.glutenproject.GlutenPlugin 2 | spark.memory.offHeap.enabled=true 3 | spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager 4 | # This static allocation is one of the hardest part of using Gluten 5 | spark.memory.offHeap.size=20g 6 | -------------------------------------------------------------------------------- /conf/sbtconfig.txt: -------------------------------------------------------------------------------- 1 | 2 | # Set the java args to high 3 | 4 | -Xmx2048M 5 | 6 | -XX:MaxPermSize=2048m 7 | 8 | -XX:ReservedCodeCacheSize=128m 9 | 10 | -XX:+CMSClassUnloadingEnabled 11 | 12 | # Set the extra SBT options 13 | 14 | -Dsbt.log.format=true 15 | 16 | # JNA 17 | 18 | -Djna.nosys=true 19 | -------------------------------------------------------------------------------- /misc/container_launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ ! 
-f /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb ]; then 3 | cp /high-performance-spark-examples/iceberg-workshop-solutions/Workshop-Template.ipynb /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb 4 | fi 5 | jupyter-lab --ip 0.0.0.0 --port 8877 6 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ffi 2 | 3 | // tag::sumJNA[] 4 | import com.sun.jna._ 5 | object SumJNA { 6 | Native.register("high-performance-spark0") 7 | @native def sum(n: Array[Int], size: Int): Int 8 | } 9 | // end::sumJNA[] 10 | -------------------------------------------------------------------------------- /core/src/main/perl/ignore.txt: -------------------------------------------------------------------------------- 1 | Makefile 2 | Makefile.old 3 | Build 4 | Build.bat 5 | META.* 6 | MYMETA.* 7 | .build/ 8 | _build/ 9 | cover_db/ 10 | blib/ 11 | inc/ 12 | .lwpcookies 13 | .last_cover_stats 14 | nytprof.out 15 | pod2htm*.tmp 16 | pm_to_blib 17 | HighPerformanceSpark-Examples-* 18 | HighPerformanceSpark-Examples-*.tar.gz 19 | -------------------------------------------------------------------------------- /core/src/main/perl/t/00-load.t: -------------------------------------------------------------------------------- 1 | #!perl -T 2 | use 5.006; 3 | use strict; 4 | use warnings; 5 | use Test::More; 6 | 7 | plan tests => 1; 8 | 9 | BEGIN { 10 | use_ok( 'HighPerformanceSpark::Examples' ) || print "Bail out!\n"; 11 | } 12 | 13 | diag( "Testing HighPerformanceSpark::Examples $HighPerformanceSpark::Examples::VERSION, Perl $], $^X" ); 14 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ffi 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | object NativeExample { 6 | def jniSum(input: RDD[(String, Array[Int])]): RDD[(String, Int)] = { 7 | input.mapValues(values => new SumJNI().sum(values)) 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /sql/partioned_table_join.sql.conf: -------------------------------------------------------------------------------- 1 | --conf spark.sql.sources.v2.bucketing.enabled=true 2 | --conf spark.sql.iceberg.planning.preserve-data-grouping=true 3 | --conf spark.sql.requireAllClusterKeysForCoPartition=false 4 | 5 | --conf spark.sql.adaptive.enabled=false 6 | --conf spark.sql.autoBroadcastJoinThreshold=-1 7 | --conf spark.sql.shuffle.partitions=4 8 | -------------------------------------------------------------------------------- /core/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples 2 | 3 | 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | 7 | class JavaInteropTestHelper(sc: SparkContext) { 8 | def generateMiniPairRDD(): RDD[(String, Long)] = { 9 | sc.parallelize(List(("panda", 12L))) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /sql/nonpartitioned_table_join.sql.conf: 
-------------------------------------------------------------------------------- 1 | --conf spark.sql.sources.v2.bucketing.enabled=true 2 | --conf spark.sql.iceberg.planning.preserve-data-grouping=true 3 | --conf spark.sql.requireAllClusterKeysForCoPartition=false 4 | 5 | --conf spark.sql.adaptive.enabled=false 6 | --conf spark.sql.autoBroadcastJoinThreshold=-1 7 | --conf spark.sql.shuffle.partitions=4 8 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ffi 2 | 3 | import com.github.sbt.jni.nativeLoader 4 | 5 | //tag::sumJNIDecorator[] 6 | @nativeLoader("high-performance-spark0") 7 | //end::sumJNIDecorator[] 8 | // tag::sumJNI[] 9 | class SumJNI { 10 | @native def sum(n: Array[Int]): Int 11 | } 12 | // end::sumJNI[] 13 | -------------------------------------------------------------------------------- /accelerators/comet_ex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | # If you change this update the workflow version too. 5 | SPARK_MAJOR=${SPARK_MAJOR:-3.5} 6 | SPARK_VERSION=${SPARK_MAJOR}.1 7 | export SPARK_MAJOR 8 | export SPARK_VERSION 9 | 10 | source setup_comet.sh 11 | pushd .. 12 | source ./env_setup.sh 13 | popd 14 | source comet_env_setup.sh 15 | pushd .. 16 | USE_COMET="true" ./run_sql_examples.sh 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ml 2 | 3 | import org.apache.spark.ml.classification._ 4 | 5 | object SimpleExport { 6 | //tag::exportLR[] 7 | def exportLRToCSV(model: LogisticRegressionModel) = { 8 | (model.coefficients.toArray :+ model.intercept).mkString(",") 9 | } 10 | //end::exportLR[] 11 | } 12 | -------------------------------------------------------------------------------- /core/src/main/perl/t/manifest.t: -------------------------------------------------------------------------------- 1 | #!perl -T 2 | use 5.006; 3 | use strict; 4 | use warnings; 5 | use Test::More; 6 | 7 | unless ( $ENV{RELEASE_TESTING} ) { 8 | plan( skip_all => "Author tests not required for installation" ); 9 | } 10 | 11 | my $min_tcm = 0.9; 12 | eval "use Test::CheckManifest $min_tcm"; 13 | plan skip_all => "Test::CheckManifest $min_tcm required" if $@; 14 | 15 | ok_manifest(); 16 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ffi 2 | 3 | object StandAlone { 4 | // $COVERAGE-OFF$ 5 | def main(args: Array[String]) { 6 | //tag::systemLoadLibrary[] 7 | System.loadLibrary("highPerformanceSpark0") 8 | //end::systemLoadLibrary[] 9 | println(new SumJNI().sum(Array(1,2,3))) 10 | } 11 | // $COVERAGE-ON$ 12 | } 13 | -------------------------------------------------------------------------------- /core/src/main/perl/t/pod.t: -------------------------------------------------------------------------------- 1 | #!perl -T 2 | use 5.006; 3 | use strict; 4 | use warnings; 5 | use Test::More; 6 | 7 | unless ( 
$ENV{RELEASE_TESTING} ) { 8 | plan( skip_all => "Author tests not required for installation" ); 9 | } 10 | 11 | # Ensure a recent version of Test::Pod 12 | my $min_tp = 1.22; 13 | eval "use Test::Pod $min_tp"; 14 | plan skip_all => "Test::Pod $min_tp required for testing POD" if $@; 15 | 16 | all_pod_files_ok(); 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ffi 2 | 3 | // tag::sumFJNA[] 4 | import com.sun.jna._ 5 | import com.sun.jna.ptr._ 6 | object SumFJNA { 7 | Native.register("high-performance-spark0") 8 | @native def sumf(n: IntByReference, a: Array[Int]): Int 9 | def easySum(size: Int, a: Array[Int]): Int = { 10 | val ns = new IntByReference(size) 11 | sumf(ns, a) 12 | } 13 | } 14 | // end::sumFJNA[] 15 | -------------------------------------------------------------------------------- /run_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | VERSION=${VERSION:-0.5} 4 | IMAGE=${IMAGE:-holdenk/hps:$VERSION} 5 | export VERSION 6 | export IMAGE 7 | docker image pull "$IMAGE" 8 | mkdir -p warehouse 9 | mkdir -p iceberg-workshop 10 | docker container run --mount type=bind,source="$(pwd)"/warehouse,target=/high-performance-spark-examples/warehouse --mount type=bind,source="$(pwd)/iceberg-workshop",target=/high-performance-spark-examples/iceberg-workshop -p 8877:8877 -p 4040:4040 -it "${IMAGE}" # /bin/bash 11 | -------------------------------------------------------------------------------- /python/examples/test_load_previous_run_data.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.session import SparkSession 2 | import os 3 | import tempfile 4 | 5 | from sparktestingbase.sqltestcase import SQLTestCase 6 | from .load_previous_run_data import LoadPreviousRunData 7 | 8 | 9 | class TestLoadPreviousRunData(SQLTestCase): 10 | def test_do_magic(self): 11 | lprd = LoadPreviousRunData(self.session) 12 | try: 13 | lprd.do_magic() 14 | except FileNotFoundError: 15 | print("No previous jobs") 16 | -------------------------------------------------------------------------------- /se_simple.json: -------------------------------------------------------------------------------- 1 | {"product_id": "pay", "table_name": "local.fake_table_name", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. 
Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": false, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} 2 | -------------------------------------------------------------------------------- /native/src/c/sum_wrapper.c: -------------------------------------------------------------------------------- 1 | #include "sum.h" 2 | #include "include/com_highperformancespark_examples_ffi_SumJNI.h" 3 | #include 4 | #include 5 | 6 | /* 7 | * Class: com_highperformancespark_examples_ffi_SumJNI 8 | * Method: sum 9 | * Signature: ([I)I 10 | */ 11 | JNIEXPORT jint JNICALL Java_com_highperformancespark_examples_ffi_SumJNI_sum 12 | (JNIEnv *env, jobject obj, jintArray ja) { 13 | jsize size = (*env)->GetArrayLength(env, ja); 14 | jint *a = (*env)->GetIntArrayElements(env, ja, 0); 15 | return sum(a, size); 16 | } 17 | -------------------------------------------------------------------------------- /sql/nonpartitioned_table_join.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS local.udevelopers ( 2 | username string, 3 | firstname string, 4 | lastname string) 5 | USING iceberg; 6 | CREATE TABLE IF NOT EXISTS local.uprojects ( 7 | creator string, 8 | uprojectname string) 9 | USING iceberg; 10 | INSERT INTO local.udevelopers VALUES("krisnova", "Kris", "Nova"); 11 | INSERT INTO local.uprojects VALUES("krisnova", "aurae"); 12 | SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username; 13 | -------------------------------------------------------------------------------- /core/src/main/perl/ghinfo.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | use Pithub; 6 | use Data::Dumper; 7 | 8 | # Find all of the commentors on an issue 9 | my $user = $ENV{'user'}; 10 | my $repo = $ENV{'repo'}; 11 | my $p = Pithub->new(user => $user, repo => $repo); 12 | while (my $id = <>) { 13 | chomp ($id); 14 | my $issue_comments = $p->issues->comments->list(issue_id => $id); 15 | print $id; 16 | while (my $comment = $issue_comments->next) { 17 | print " ".$comment->{"user"}->{"login"}; 18 | } 19 | print "\n"; 20 | } 21 | -------------------------------------------------------------------------------- /sql/gluten_only_nonpartitioned_table_join.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS local.udevelopers ( 2 | username string, 3 | firstname string, 4 | lastname string) 5 | USING iceberg; 6 | CREATE TABLE IF NOT EXISTS local.uprojects ( 7 | creator string, 8 | uprojectname string) 9 | USING iceberg; 10 | INSERT INTO local.udevelopers VALUES("krisnova", "Kris", "Nova"); 11 | INSERT INTO local.uprojects VALUES("krisnova", "aurae"); 12 | SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username; 13 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.tokenize 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | object SampleTokenize { 6 | //tag::DIFFICULT[] 7 | def difficultTokenizeRDD(input: RDD[String]) = { 8 | input.flatMap(_.split(" ")) 9 | } 
10 | //end::DIFFICULT[] 11 | 12 | //tag::EASY[] 13 | def tokenizeRDD(input: RDD[String]) = { 14 | input.flatMap(tokenize) 15 | } 16 | 17 | protected[tokenize] def tokenize(input: String) = { 18 | input.split(" ") 19 | } 20 | //end::EASY[] 21 | } 22 | -------------------------------------------------------------------------------- /sql/partioned_table_join.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS local.developers ( 2 | username string, 3 | firstname string, 4 | lastname string) 5 | USING iceberg 6 | PARTITIONED BY (username); 7 | CREATE TABLE IF NOT EXISTS local.projects ( 8 | creator string, 9 | projectname string) 10 | USING iceberg 11 | PARTITIONED BY (creator); 12 | INSERT INTO local.developers VALUES("krisnova", "Kris", "Nova"); 13 | INSERT INTO local.projects VALUES("krisnova", "aurae"); 14 | SELECT * FROM local.developers INNER JOIN local.projects ON local.projects.creator = local.developers.username; 15 | -------------------------------------------------------------------------------- /accelerators/setup_gluten_deps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | sudo apt-get update 5 | #tag::gluten_deps[] 6 | sudo apt-get install -y locales wget tar tzdata git ccache cmake ninja-build build-essential \ 7 | llvm-dev clang libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev \ 8 | libcurl4-openssl-dev maven rapidjson-dev libdouble-conversion-dev libgflags-dev \ 9 | libsodium-dev libsnappy-dev nasm 10 | sudo apt install -y libunwind-dev 11 | sudo apt-get install -y libgoogle-glog-dev 12 | sudo apt-get -y install docker-compose 13 | sudo apt-get install -y libre2-9 || sudo apt-get install -y libre2-10 14 | #end::gluten_deps[] 15 | -------------------------------------------------------------------------------- /.scalafix.conf: -------------------------------------------------------------------------------- 1 | UnionRewrite.deprecatedMethod { 2 | "unionAll" = "union" 3 | } 4 | 5 | OrganizeImports { 6 | blankLines = Auto, 7 | groups = [ 8 | "re:javax?\\." 9 | "scala." 10 | "org.apache.spark." 
11 | "*" 12 | ], 13 | removeUnused = false 14 | } 15 | 16 | rules = [ 17 | DisableSyntax, 18 | SparkAutoUpgrade, 19 | MigrateHiveContext, 20 | MigrateToSparkSessionBuilder, 21 | MigrateDeprecatedDataFrameReaderFuns, 22 | AccumulatorUpgrade, 23 | onFailureFix, 24 | ExecutorPluginWarn, 25 | UnionRewrite, 26 | GroupByKeyWarn, 27 | GroupByKeyRewrite, 28 | MetadataWarnQQ, 29 | ScalaTestExtendsFix, 30 | ScalaTestImportChange 31 | ] -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | version: '{build}' 2 | 3 | platform: 4 | - x86 5 | - x64 6 | 7 | environment: 8 | matrix: 9 | - JAVA_HOME: C:\Program Files\Java\jdk1.8.0 10 | 11 | 12 | install: 13 | - ps: Start-FileDownload 'http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/0.13.9/sbt-launch.jar' 14 | - xcopy sbt-launch.jar sbt\ 15 | - del build.sbt 16 | - copy build_windows.sbt build.sbt 17 | 18 | build_script: 19 | - sbt\sbt clean compile 20 | 21 | test_script: 22 | - sbt\sbt "testOnly com.highperformancespark.examples.tools.FilterInvalidPandasSuite" 23 | 24 | cache: 25 | - C:\Users\appveyor\.ivy2 26 | - C:\Users\appveyor\.m2 27 | - C:\Users\appveyor\.sbt -------------------------------------------------------------------------------- /accelerators/setup_gluten_from_src.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | # Setup deps 5 | source setup_gluten_deps.sh 6 | 7 | # Try gluten w/clickhouse 8 | #if [ ! -d gluten ]; then 9 | # git clone https://github.com/oap-project/gluten.git 10 | # cd gluten 11 | # bash ./ep/build-clickhouse/src/build_clickhouse.sh 12 | #fi 13 | 14 | # Build gluten 15 | if [ ! -d gluten ]; then 16 | # We need Spark 3.5 w/scala212 17 | git clone git@github.com:holdenk/gluten.git || git clone https://github.com/holdenk/gluten.git 18 | cd gluten 19 | git checkout add-spark35-scala213-hack 20 | ./dev/builddeps-veloxbe.sh 21 | mvn clean package -Pbackends-velox -Pspark-3.5 -DskipTests 22 | cd .. 
23 | fi 24 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.objects; 2 | 3 | import java.io.Serializable; 4 | 5 | public class JavaCoffeeShop implements Serializable { 6 | private String zip; 7 | private String name; 8 | 9 | public JavaCoffeeShop(String zip, String name) { 10 | this.zip = zip; 11 | this.name = name; 12 | } 13 | 14 | public String getZip() { 15 | return zip; 16 | } 17 | 18 | public void setZip(String zip) { 19 | this.zip = zip; 20 | } 21 | 22 | public String getName() { 23 | return name; 24 | } 25 | 26 | public void setName(String name) { 27 | this.name = name; 28 | } 29 | } -------------------------------------------------------------------------------- /native/src/c/include/com_highperformancespark_examples_ffi_SumJNI.h: -------------------------------------------------------------------------------- 1 | /* DO NOT EDIT THIS FILE - it is machine generated */ 2 | #include 3 | /* Header for class com_highperformancespark_examples_ffi_SumJNI */ 4 | 5 | #ifndef _Included_com_highperformancespark_examples_ffi_SumJNI 6 | #define _Included_com_highperformancespark_examples_ffi_SumJNI 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | /* 11 | * Class: com_highperformancespark_examples_ffi_SumJNI 12 | * Method: sum 13 | * Signature: ([I)I 14 | */ 15 | JNIEXPORT jint JNICALL Java_com_highperformancespark_examples_ffi_SumJNI_sum 16 | (JNIEnv *, jobject, jintArray); 17 | 18 | #ifdef __cplusplus 19 | } 20 | #endif 21 | #endif 22 | -------------------------------------------------------------------------------- /accelerators/gluten_spark_34_ex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | cd "${SCRIPT_DIR}" 7 | source "${SCRIPT_DIR}/setup_gluten_spark34.sh" 8 | 9 | export SPARK_HOME 10 | PATH="$(pwd)/${SPARK_DIR}/bin:$PATH" 11 | export PATH 12 | "${SPARK_HOME}/bin/spark-sql" --master local[5] \ 13 | --conf spark.plugins=io.glutenproject.GlutenPlugin \ 14 | --conf spark.memory.offHeap.enabled=true \ 15 | --conf spark.memory.offHeap.size=5g \ 16 | --jars "${GLUTEN_JAR}" \ 17 | --conf spark.eventLog.enabled=true \ 18 | -e "SELECT 1" 19 | 20 | source gluten_env_setup.sh 21 | cd .. 
22 | ./run_sql_examples.sh || echo "Expected to fail" 23 | -------------------------------------------------------------------------------- /sql/iceberg-schema-evolution-gotcha-possibility.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS local.udevelopers_sorted; 2 | CREATE TABLE IF NOT EXISTS local.udevelopers_sorted ( 3 | username string, 4 | firstname string, 5 | lastname string) 6 | USING ICEBERG; 7 | INSERT INTO local.udevelopers_sorted VALUES("krisnova", "Kris", "Nova"); 8 | ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY lastname; 9 | ALTER TABLE local.udevelopers_sorted RENAME COLUMN lastname TO deprecated_lastname; 10 | SELECT * FROM local.udevelopers_sorted; 11 | ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY username; 12 | ALTER TABLE local.udevelopers_sorted DROP COLUMN deprecated_lastname; 13 | SELECT * FROM local.udevelopers_sorted; 14 | 15 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Test our simple JNI 3 | */ 4 | package com.highperformancespark.examples.ffi 5 | 6 | import com.holdenkarau.spark.testing._ 7 | import org.scalatest.funsuite.AnyFunSuite 8 | import org.scalatest.matchers.should.Matchers._ 9 | import org.scalatestplus.scalacheck.Checkers 10 | 11 | 12 | class PipeExampleSuite extends AnyFunSuite with SharedSparkContext with Checkers { 13 | ignore("commentors on a pr") { 14 | val rdd = sc.parallelize(List(12883)) 15 | val expected = (12883, List("SparkQA", "srowen")) 16 | val result = PipeExample.lookupUserPRS(sc, rdd) 17 | assert(expected === result.collect()(0)) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /target-validator/runme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC1091,SC2034 3 | 4 | source ../env_setup.sh 5 | set -ex 6 | export SPARK_VERSION="${SPARK_VERSION:-3.4.1}" 7 | 8 | # Disable for now until the target folks agree on the PR nested builds are slow. 9 | exit 0 10 | 11 | git clone git@github.com:holdenk/data-validator.git || git clone https://github.com/holdenk/data-validator.git 12 | cd data-validator 13 | git checkout upgrade-to-modern-spark 14 | sbt -Dspark="${SPARK_VERSION}" clean assembly 15 | JAR_PATH="$(pwd)/target/scala-2.12/data-validator-assembly-${SPARK_VERSION}_0.15.0.jar" 16 | export JAR_PATH 17 | cd .. 18 | "${SPARK_HOME}/bin/spark-submit" --master local "$JAR_PATH" --config ex.yaml || echo "Failed as expected." 
19 | -------------------------------------------------------------------------------- /misc/kernel.json: -------------------------------------------------------------------------------- 1 | { 2 | "argv": [ 3 | "java", 4 | "-cp", 5 | "/home/dev/.local/share/jupyter/kernels/scala2.13/launcher.jar:.:/high-performance-spark-examples/:/high-performance-spark-examples/target/scala-2.13/home/dev/.local/share/jupyter/kernels/scala/launcher.jar:/high-performance-spark-examples/spark-3.5.2-bin-hadoop3-scala2.13/jars/*", 6 | "coursier.bootstrap.launcher.Launcher", 7 | "--log", 8 | "info", 9 | "--metabrowse", 10 | "--id", 11 | "scala2.13", 12 | "--display-name", 13 | "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", 14 | "--connection-file", 15 | "{connection_file}" 16 | ], 17 | "display_name": "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", 18 | "language": "scala" 19 | } 20 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/SQLExtension.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Extension for the SparkSession to allow us to plug in a custom optimizer 3 | */ 4 | 5 | package com.highperformancespark.examples.dataframe 6 | 7 | import org.apache.spark.sql.{SparkSessionExtensions, SparkSessionExtensionsProvider} 8 | 9 | class SQLExtension extends SparkSessionExtensionsProvider { 10 | override def apply(extensions: SparkSessionExtensions): Unit = { 11 | // There are _many different_ types of rules you can inject, here we're focused on 12 | // making things go fast so our sample is an optimizer rule (AQE rules could also make sense). 13 | extensions.injectOptimizerRule(session => NullabilityFilterOptimizer) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /core/src/main/perl/t/pod-coverage.t: -------------------------------------------------------------------------------- 1 | #!perl -T 2 | use 5.006; 3 | use strict; 4 | use warnings; 5 | use Test::More; 6 | 7 | unless ( $ENV{RELEASE_TESTING} ) { 8 | plan( skip_all => "Author tests not required for installation" ); 9 | } 10 | 11 | # Ensure a recent version of Test::Pod::Coverage 12 | my $min_tpc = 1.08; 13 | eval "use Test::Pod::Coverage $min_tpc"; 14 | plan skip_all => "Test::Pod::Coverage $min_tpc required for testing POD coverage" 15 | if $@; 16 | 17 | # Test::Pod::Coverage doesn't require a minimum Pod::Coverage version, 18 | # but older versions don't recognize some common documentation styles 19 | my $min_pc = 0.18; 20 | eval "use Pod::Coverage $min_pc"; 21 | plan skip_all => "Pod::Coverage $min_pc required for testing POD coverage" 22 | if $@; 23 | 24 | all_pod_coverage_ok(); 25 | -------------------------------------------------------------------------------- /sql/iceberg-schema-evolution-gotcha-workaround.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS local.udevelopers_sorted; 2 | CREATE TABLE IF NOT EXISTS local.udevelopers_sorted ( 3 | username string, 4 | firstname string, 5 | lastname string) 6 | USING ICEBERG; 7 | ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY lastname; 8 | INSERT INTO local.udevelopers_sorted VALUES("krisnova", "Kris", "Nova"); 9 | SELECT * FROM local.udevelopers_sorted; 10 | ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY username; 11 | -- Hack, add it to identifier fields so we can do a "partial" drop where it stays in the schema and we don't 12 | -- 
corrupt the metadata. 13 | ALTER TABLE local.udevelopers_sorted ADD PARTITION FIELD lastname; 14 | ALTER TABLE local.udevelopers_sorted DROP PARTITION FIELD lastname; 15 | SELECT * FROM local.udevelopers_sorted; 16 | -------------------------------------------------------------------------------- /accelerators/comet_env_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPARK_EXTRA=" 4 | --jars ${COMET_JAR} \ 5 | --driver-class-path ${COMET_JAR} \ 6 | --conf spark.comet.enabled=true \ 7 | --conf spark.comet.exec.enabled=true \ 8 | --conf spark.comet.exec.all.enabled=true \ 9 | --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ 10 | --conf spark.comet.exec.shuffle.enabled=true \ 11 | --conf spark.comet.columnar.shuffle.enabled=true" 12 | # Instead of --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions we set 13 | # EXTRA_EXTENSIONS so it can be appended to iceberg 14 | if [ -z "$EXTRA_EXTENSIONS" ]; then 15 | EXTRA_EXTENSIONS="org.apache.comet.CometSparkSessionExtensions" 16 | else 17 | EXTRA_EXTENSIONS="org.apache.comet.CometSparkSessionExtensions,$EXTRA_EXTENSIONS" 18 | fi 19 | export EXTRA_EXTENSIONS 20 | export SPARK_EXTRA 21 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.objects; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | 6 | public class JavaPandaPlace implements Serializable { 7 | private String name; 8 | private List pandas; 9 | 10 | /** 11 | * @param name place name 12 | * @param pandas pandas in that place 13 | */ 14 | public JavaPandaPlace(String name, List pandas) { 15 | this.name = name; 16 | this.pandas = pandas; 17 | } 18 | 19 | public String getName() { 20 | return name; 21 | } 22 | 23 | public void setName(String name) { 24 | this.name = name; 25 | } 26 | 27 | public List getPandas() { 28 | return pandas; 29 | } 30 | 31 | public void setPandas(List pandas) { 32 | this.pandas = pandas; 33 | } 34 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Using plain-old-sql 3 | */ 4 | package com.highperformancespark.examples.dataframe 5 | 6 | import org.apache.spark.sql._ 7 | 8 | case class RegularSQL(sqlContext: SQLContext) { 9 | 10 | //tag::queryTable[] 11 | def querySQL(): DataFrame = { 12 | sqlContext.sql("SELECT * FROM pandas WHERE size > 0") 13 | } 14 | //end::queryTable[] 15 | 16 | // TODO: Holden: include a parquet example file and point this to that. 
17 | //tag::queryRawFile[] 18 | def queryRawFile(): DataFrame = { 19 | sqlContext.sql("SELECT * FROM parquet.`path_to_parquet_file`") 20 | } 21 | //end::queryRawFile[] 22 | 23 | //tag::registerTable[] 24 | def registerTable(df: DataFrame): Unit = { 25 | df.registerTempTable("pandas") 26 | df.write.saveAsTable("perm_pandas") 27 | } 28 | //end::registerTable[] 29 | } 30 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 2 | 3 | resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" 4 | 5 | resolvers += "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/" 6 | 7 | 8 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.2") 9 | 10 | addDependencyTreePlugin 11 | 12 | //tag::scalaFix[] 13 | addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.12.1") 14 | //end::scalaFix[] 15 | 16 | //tag::sbtJNIPlugin[] 17 | addSbtPlugin("com.github.sbt" %% "sbt-jni" % "1.7.0") 18 | //end::sbtJNIPlugin[] 19 | 20 | //tag::xmlVersionConflict[] 21 | // See https://github.com/scala/bug/issues/12632 22 | ThisBuild / libraryDependencySchemes ++= Seq( 23 | "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always 24 | ) 25 | //end::xmlVersionConflict[] 26 | 27 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0") 28 | -------------------------------------------------------------------------------- /python/examples/dual_write.py: -------------------------------------------------------------------------------- 1 | import asyncactions # noqa # pylint: disable=unused-import 2 | 3 | 4 | class DualWriteExample: 5 | def do_write(self, df, p1, p2): 6 | """ 7 | Apply two concrete actions to a DataFrame in parallel. 8 | A common use case is two views of the same data, normally 9 | one with sensitive data and one scrubbed/clean. 10 | """ 11 | # First we "persist" it (you can also checkpoint or choose a different 12 | # level of persistence. 13 | df.persist() 14 | df.count() 15 | # Create the distinct "safe" view. 16 | df1 = df.select("times") 17 | # Start the async actions 18 | async1 = df1.write.mode("append").format("parquet").saveAsync(p1) 19 | async2 = df.write.mode("append").format("parquet").saveAsync(p2) 20 | # Block until the writes are both finished. 
21 | async1.result() 22 | async2.result() 23 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.errors 2 | 3 | import com.holdenkarau.spark.testing._ 4 | import org.scalatest.funsuite.AnyFunSuite 5 | 6 | class ThrowsSuite extends AnyFunSuite with SharedSparkContext { 7 | test("inner throw & outer throw should both throw SparkExceptions exceptions") { 8 | intercept[org.apache.spark.SparkException] { 9 | Throws.throwInner(sc) 10 | } 11 | intercept[org.apache.spark.SparkException] { 12 | Throws.throwOuter(sc) 13 | } 14 | intercept[org.apache.spark.SparkException] { 15 | Throws.throwInner2(sc) 16 | } 17 | intercept[org.apache.spark.SparkException] { 18 | Throws.throwOuter2(sc) 19 | } 20 | } 21 | 22 | test("loading missing data should throw") { 23 | intercept[org.apache.hadoop.mapred.InvalidInputException] { 24 | Throws.nonExistentInput(sc) 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples; 2 | 3 | //tag::wordCount[] 4 | import scala.Tuple2; 5 | 6 | import org.apache.spark.api.java.JavaRDD; 7 | import org.apache.spark.api.java.JavaPairRDD; 8 | import org.apache.spark.api.java.JavaSparkContext; 9 | 10 | import java.util.regex.Pattern; 11 | import java.util.Arrays; 12 | 13 | public final class WordCount { 14 | private static final Pattern pattern = Pattern.compile(" "); 15 | 16 | public static void main(String[] args) throws Exception { 17 | JavaSparkContext jsc = new JavaSparkContext(); 18 | JavaRDD lines = jsc.textFile(args[0]); 19 | JavaRDD words = lines.flatMap(e -> Arrays.asList( 20 | pattern.split(e)).iterator()); 21 | JavaPairRDD wordsIntial = words.mapToPair( 22 | e -> new Tuple2(e, 1)); 23 | } 24 | } 25 | //end::wordCount[] 26 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.goldilocks 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import com.holdenkarau.spark.testing.SharedSparkContext 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | 9 | class JoinTest extends AnyFunSuite with SharedSparkContext { 10 | test("Hash join"){ 11 | val keySet = "a, b, c, d, e, f, g".split(",") 12 | val smallRDD = sc.parallelize(keySet.map(letter => (letter, letter.hashCode))) 13 | val largeRDD: RDD[(String, Double)] = 14 | sc.parallelize(keySet.flatMap{ letter => 15 | Range(1, 50).map(i => (letter, letter.hashCode() / i.toDouble))}) 16 | val result: RDD[(String, (Double, Int))] = 17 | RDDJoinExamples.manualBroadcastHashJoin( 18 | largeRDD, smallRDD) 19 | val nativeJoin: RDD[(String, (Double, Int))] = largeRDD.join(smallRDD) 20 | 21 | assert(result.subtract(nativeJoin).count == 0) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /target-validator/ex.yaml: -------------------------------------------------------------------------------- 1 | detailedErrors: true 2 | numKeyCols: 4 3 | # We might have a large number of errors so 
just show the first 5 4 | numErrorsToReport: 5 5 | 6 | email: 7 | smtpHost: smtp.example.com 8 | subject: Data Validation Summary 9 | from: data-validator-no-reply@example.com 10 | to: 11 | - professor-timbit@example.com 12 | 13 | tables: 14 | - db: gender_paygaps 15 | table: uk 16 | # Columns that taken together uniquely specifies each row (think of groupBy) 17 | keyColumns: 18 | - CompanyNumber 19 | - EmployerId 20 | - CompanyLinkToGPGInfo 21 | - ResponsiblePerson 22 | # Used to filter 23 | condition: MaleBonusPercent >= FemaleBonusPercent 24 | checks: 25 | # We expect at least 500 records 26 | - type: rowCount 27 | minNumRows: 500 28 | # We don't expect more than 1% not companies in the dataset. 29 | - type: nullCheck 30 | column: CompanyNumber 31 | threshold: 0.01 32 | -------------------------------------------------------------------------------- /accelerators/setup_comet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | source install_rust_if_needed.sh 5 | 6 | if command -v protoc >/dev/null 2>&1; then 7 | echo "protoc already installed" 8 | else 9 | sudo apt-get install -y protobuf-compiler 10 | fi 11 | 12 | if [ -z "${SPARK_MAJOR}" ]; then 13 | echo "Need a spark major version specified." 14 | exit 1 15 | else 16 | echo "Building comet for Spark ${SPARK_MAJOR}" 17 | fi 18 | 19 | #tag::build[] 20 | # If we don't have fusion checked out do it 21 | if [ ! -d arrow-datafusion-comet ]; then 22 | git clone https://github.com/apache/arrow-datafusion-comet.git 23 | fi 24 | 25 | # Build JAR if not present 26 | if [ -z "$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar)" ]; then 27 | cd arrow-datafusion-comet 28 | make clean release PROFILES="-Pspark-${SPARK_MAJOR} -Pscala-2.13" 29 | cd .. 30 | fi 31 | COMET_JAR="$(pwd)/$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*SNAPSHOT.jar)" 32 | export COMET_JAR 33 | #end::build[] 34 | -------------------------------------------------------------------------------- /high_performance_pyspark/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | 19 | """ 20 | Python version of selected examples from High Performance Spark 21 | """ 22 | 23 | import os 24 | import sys 25 | -------------------------------------------------------------------------------- /python/examples/test_dual_write.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | # tag::test[] 5 | from sparktestingbase.sqltestcase import SQLTestCase 6 | from pyspark.sql.functions import current_timestamp 7 | from pyspark.sql.types import Row 8 | from .dual_write import DualWriteExample 9 | 10 | 11 | class DualWriteTest(SQLTestCase): 12 | def test_always_passes(self): 13 | self.assertTrue(True) 14 | 15 | def test_actual_dual_write(self): 16 | tempdir = tempfile.mkdtemp() 17 | p1 = os.path.join(tempdir, "data1") 18 | p2 = os.path.join(tempdir, "data2") 19 | df = self.sqlCtx.createDataFrame([Row("timbit"), Row("farted")], ["names"]) 20 | combined = df.withColumn("times", current_timestamp()) 21 | DualWriteExample().do_write(combined, p1, p2) 22 | df1 = self.sqlCtx.read.format("parquet").load(p1) 23 | df2 = self.sqlCtx.read.format("parquet").load(p2) 24 | self.assertDataFrameEqual(df2.select("times"), df1, 0.1) 25 | 26 | 27 | # end::test[] 28 | -------------------------------------------------------------------------------- /sql/wap.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS local.wap_projects; 2 | CREATE TABLE local.wap_projects ( 3 | creator string, 4 | projectname string) 5 | USING iceberg 6 | PARTITIONED BY (creator); 7 | ALTER TABLE local.projects SET TBLPROPERTIES ( 8 | 'write.wap.enabled''true' 9 | ); 10 | -- We need a first commit, see https://github.com/apache/iceberg/issues/8849 11 | INSERT INTO local.wap_projects VALUES("holdenk", "spark"); 12 | ALTER TABLE local.wap_projects DROP BRANCH IF EXISTS `audit-branch`; 13 | ALTER TABLE local.wap_projects CREATE BRANCH `audit-branch`; 14 | SET spark.wap.branch = 'audit-branch'; 15 | INSERT INTO local.projects VALUES("krisnova", "aurae"); 16 | SELECT count(*) FROM local.wap_projects VERSION AS OF 'audit-branch' WHERE creator is NULL; 17 | SELECT count(*) FROM local.wap_projects VERSION AS OF 'audit-branch' WHERE creator == "krisnova"; 18 | CALL local.system.remove_orphan_files(table => 'local.wap_projects'); 19 | CALL local.system.fast_forward("local.wap_projects", "main", "audit-branch"); 20 | -------------------------------------------------------------------------------- /core/src/main/perl/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.006; 2 | use strict; 3 | use warnings; 4 | use ExtUtils::MakeMaker; 5 | 6 | WriteMakefile( 7 | NAME => 'HighPerformanceSpark::Examples', 8 | AUTHOR => q{Holden Karau And Rachel Warren }, 9 | VERSION_FROM => 'lib/HighPerformanceSpark/Examples.pm', 10 | ABSTRACT_FROM => 'lib/HighPerformanceSpark/Examples.pm', 11 | LICENSE => 'apache_2_0', 12 | PL_FILES => {}, 13 | EXE_FILES => [ 'ghinfo.pl' ], 14 | MIN_PERL_VERSION => 5.006, 15 | CONFIGURE_REQUIRES => { 16 | 'ExtUtils::MakeMaker' => 0, 17 | }, 18 | BUILD_REQUIRES => { 19 | 'Test::More' => 0, 20 | }, 21 | PREREQ_PM => { 22 | 'Pithub' => 0.01033, 23 | #'ABC' => 1.6, 24 | #'Foo::Bar::Module' => 5.0401, 25 | }, 26 | dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', }, 27 | clean => { FILES => 'HighPerformanceSpark-Examples-*' }, 28 | ); 29 | -------------------------------------------------------------------------------- 
/se_complex.json: -------------------------------------------------------------------------------- 1 | {"product_id": "pay", "table_name": "local.3rd_fake", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": false, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} 2 | {"product_id": "pay", "table_name": "local.3rd_fake", "rule_type": "query_dq", "rule": "history", "column_name": "MaleBonusPercent", "expectation": "(select count(*) from 3rd_fake_view) > (select input_count from local.dq_stats WHERE table_name='local.3rd_fake' LIMIT 1)", "action_if_failed": "fail", "tag": "", "description": "We should always have more records than before", "enable_for_source_dq_validation": false, "enable_for_target_dq_validation": true, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} 3 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Simple tests for DStreamSuite - 3 | * normally we would use streaming tests but since we want to test 4 | * context creation as well we don't. 5 | */ 6 | package com.highperformancespark.examples.streaming 7 | 8 | import java.lang.Thread 9 | 10 | import org.apache.spark.streaming._ 11 | 12 | import com.holdenkarau.spark.testing._ 13 | import org.scalatest.funsuite.AnyFunSuite 14 | 15 | class DStreamExamplesSuite extends AnyFunSuite with SharedSparkContext { 16 | test("simple set up") { 17 | val ssc = DStreamExamples.makeStreamingContext(sc) 18 | val inputStream = DStreamExamples.fileAPIExample(ssc, "./") 19 | val repartitioned = DStreamExamples.repartition(inputStream) 20 | repartitioned.foreachRDD(rdd => 21 | assert(rdd.partitioner.get.numPartitions == 20) 22 | ) 23 | ssc.start() 24 | // This is bad don't do this - but we don't have the full test tools here 25 | Thread.sleep(100) 26 | ssc.stop() 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /python/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = examples 3 | version = attr: examples.__version__ 4 | author = Holden and Anya 5 | author_email = your@email.address 6 | url = https://github.com/high-performance-spark/high-performance-spark-examples 7 | description = Python Examples for High Performance Spark 8 | long_description = file: README.md 9 | long_description_content_type = text/markdown 10 | keywords = example, setuptools, pyspark 11 | license = BSD 3-Clause License 12 | classifiers = 13 | License :: OSI Approved :: BSD License 14 | Programming Language :: Python :: 3 15 | 16 | [options] 17 | packages = find: 18 | zip_safe = True 19 | include_package_data = True 20 | install_requires = 21 | pandas >= 1.4.1 22 | PyYAML >= 6.0 23 | typer 24 | mypy 25 | pyspark 26 | pyspark-asyncactions 27 | 28 | 29 | [options.entry_points] 30 | console_scripts = 31 | my-example-utility = example.example_module:main 32 | 33 | [options.extras_require] 34 | dev = 35 | black>=22.1.0 36 | flake8>=4.0.1 37 | 38 | 
[options.package_data] 39 | * = README.md -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.structuredstreaming 2 | 3 | import scala.concurrent.duration._ 4 | 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming._ 8 | import org.apache.spark.sql.streaming.Trigger 9 | 10 | 11 | object Structured { 12 | def load(inputPath: String, session: SparkSession): Dataset[_] = { 13 | //tag::loadSimple[] 14 | session.readStream.parquet(inputPath) 15 | //end::loadSimple[] 16 | } 17 | def write(counts: Dataset[_]) = { 18 | //tag::writeComplete[] 19 | val query = counts.writeStream. 20 | // Specify the output mode as Complete to support aggregations 21 | outputMode(OutputMode.Complete()). 22 | // Write out the result as parquet 23 | format("parquet"). 24 | // Specify the interval at which new data will be picked up 25 | trigger(Trigger.ProcessingTime(1.second)). 26 | queryName("pandas").start() 27 | //end::writeComplete[] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.transformations 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | 6 | object NarrowAndWide { 7 | 8 | //toDO: Probably should write some sort of test for this. 9 | //this is used in chapter 4 for the stage diagram 10 | def sillySparkProgram(rdd1 : RDD[Int]) = { 11 | 12 | //tag::narrowWide[] 13 | 14 | //Narrow dependency. Map the rdd to tuples of (x, 1) 15 | val rdd2 = rdd1.map(x => (x, 1)) 16 | //wide dependency groupByKey 17 | val rdd3 = rdd2.groupByKey() 18 | //end::narrowWide[] 19 | 20 | rdd3 21 | } 22 | //this is used in chapter two for the stage diagram. 
23 | 24 | //tag::stageDiagram[] 25 | def simpleSparkProgram(rdd : RDD[Double]): Long ={ 26 | //stage1 27 | rdd.filter(_< 1000.0) 28 | .map(x => (x, x) ) 29 | //stage2 30 | .groupByKey() 31 | .map{ case(value, groups) => (groups.sum, value)} 32 | //stage 3 33 | .sortByKey() 34 | .count() 35 | } 36 | //end::stageDiagram[] 37 | 38 | } 39 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Extension for the SparkSession to allow us to plug in a custom optimizer 3 | */ 4 | 5 | package com.highperformancespark.examples.dataframe 6 | 7 | import org.apache.spark.sql.catalyst.optimizer._ 8 | import org.apache.spark.sql.catalyst.plans.logical._ 9 | import org.apache.spark.sql.catalyst.rules.Rule 10 | import org.apache.spark.sql.catalyst.trees.TreePattern._ 11 | import org.apache.spark.sql.catalyst.expressions.{And, IsNotNull} 12 | 13 | object NullabilityFilterOptimizer extends Rule[LogicalPlan] { 14 | 15 | def apply(plan: LogicalPlan): LogicalPlan = { 16 | plan.transform { 17 | case p @ Project(projectList, projChild) => 18 | val children = projectList.flatMap(_.children) 19 | // If there are no null intolerant children don't worry about it 20 | if (children.isEmpty) { 21 | p 22 | } else { 23 | val filterCond = children.map(IsNotNull(_)).reduceLeft(And) 24 | Project(projectList, Filter(filterCond, projChild)) 25 | } 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.dataframe 2 | 3 | import java.util.Arrays 4 | import java.util.Objects 5 | 6 | /** 7 | * @param id panda id 8 | * @param zip zip code of panda residence 9 | * @param pt Type of panda as a string 10 | * @param happy if panda is happy 11 | * @param attributes array of panada attributes 12 | */ 13 | case class RawPanda(id: Long, zip: String, pt: String, 14 | happy: Boolean, attributes: Array[Double]) { 15 | override def equals(o: Any) = o match { 16 | case other: RawPanda => (id == other.id && pt == other.pt && 17 | happy == other.happy && attributes.sameElements(other.attributes)) 18 | case _ => false 19 | } 20 | override def hashCode(): Int = { 21 | 3 * Objects.hashCode(id) + 7 * Objects.hashCode(zip) + 22 | 11 * Objects.hashCode(pt) + 13 * Arrays.hashCode(attributes) 23 | } 24 | } 25 | 26 | /** 27 | * @param name place name 28 | * @param pandas pandas in that place 29 | */ 30 | case class PandaPlace(name: String, pandas: Array[RawPanda]) 31 | 32 | case class CoffeeShop(zip: String, name: String) 33 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Simple tests for tokenization 3 | */ 4 | package com.highperformancespark.examples.tokenize 5 | 6 | import java.lang.Thread 7 | 8 | import org.apache.spark.streaming._ 9 | 10 | import com.holdenkarau.spark.testing._ 11 | import org.scalatest.funsuite.AnyFunSuite 12 | 13 | class SampleTokenizeSuite extends AnyFunSuite with SharedSparkContext { 14 | val input = List("hi holden", "I like coffee") 
15 | val expected = List("hi", "holden", "I", "like", "coffee") 16 | 17 | test("test the difficult to test one") { 18 | val inputRDD = sc.parallelize(input) 19 | val result = SampleTokenize.difficultTokenizeRDD(inputRDD).collect() 20 | assert(result.toList == expected) 21 | } 22 | 23 | test("test the easy to test one like the difficult one") { 24 | val inputRDD = sc.parallelize(input) 25 | val result = SampleTokenize.tokenizeRDD(inputRDD).collect() 26 | assert(result.toList == expected) 27 | } 28 | 29 | test("test the easy inner function - note no SC needed") { 30 | assert(SampleTokenize.tokenize("hi holden").toList == List("hi", "holden")) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /python/examples/load_previous_run_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | 5 | class LoadPreviousRunData(object): 6 | def __init__(self, session): 7 | self.session = session 8 | 9 | def find_oldest_id(self, local_path): 10 | """Find the oldest Spark job since it's probably not being updated.""" 11 | directories = os.listdir(local_path) 12 | return min(directories, key=lambda x: os.path.getmtime(f"{local_path}/{x}")) 13 | 14 | def do_magic(self): 15 | local_path = "/tmp/spark-events" 16 | event_log_path = f"file://{local_path}" 17 | application_id = self.find_oldest_id(local_path) 18 | return self.load_json_records(event_log_path, application_id) 19 | 20 | # tag::load[] 21 | def load_json_records(self, event_log_path, application_id): 22 | print(f"Loading {application_id}") 23 | full_log_path = f"{event_log_path}/{application_id}" 24 | df = self.session.read.json(full_log_path) 25 | special_events = df.filter( 26 | (df["Event"] == "SparkListenerExecutorAdded") 27 | | (df["Event"] == "SparkListenerJobEnd") 28 | ) 29 | special_events.show() 30 | 31 | # end::load[] 32 | -------------------------------------------------------------------------------- /accelerators/gluten_env_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if we have gluten and the gluten UDFs present 4 | GLUTEN_NATIVE_LIB_NAME=libhigh-performance-spark-gluten-0.so 5 | NATIVE_LIB_DIR=$(pwd)/../native/src/ 6 | NATIVE_LIB_PATH="${NATIVE_LIB_DIR}${GLUTEN_NATIVE_LIB_NAME}" 7 | GLUTEN_HOME=incubator-gluten 8 | source /etc/lsb-release 9 | if [ -n "$GLUTEN_JAR_PATH" ]; then 10 | GLUTEN_EXISTS="true" 11 | GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \ 12 | --conf spark.memory.offHeap.enabled=true \ 13 | --conf spark.memory.offHeap.size=5g \ 14 | --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ 15 | --jars ${GLUTEN_JAR_PATH}" 16 | fi 17 | if [ -f "${NATIVE_LIB_PATH}" ]; then 18 | if [ "$GLUTEN_EXISTS" == "true" ]; then 19 | GLUTEN_UDF_EXISTS="true" 20 | GLUTEN_SPARK_EXTRA="$GLUTEN_SPARK_EXTRA \ 21 | --conf spark.jars=${GLUTEN_JAR_PATH} \ 22 | --conf spark.gluten.loadLibFromJar=true \ 23 | --files ${NATIVE_LIB_PATH} \ 24 | --conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=${GLUTEN_NATIVE_LIB_NAME}" 25 | fi 26 | fi 27 | SPARK_EXTRA="$GLUTEN_SPARK_EXTRA" 28 | 29 | export SPARK_EXTRA 30 | export GLUTEN_UDF_EXISTS 31 | export GLUTEN_EXISTS 32 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala: --------------------------------------------------------------------------------
1 | /** 2 | * Simple tests for our CustomPipeline demo pipeline stage 3 | */ 4 | package com.highperformancespark.examples.ml 5 | 6 | import org.apache.spark.sql.Dataset 7 | 8 | import com.holdenkarau.spark.testing.DataFrameSuiteBase 9 | import org.scalatest.funsuite.AnyFunSuite 10 | 11 | case class TestRow(id: Int, inputColumn: String) 12 | 13 | class CustomPipelineSuite extends AnyFunSuite with DataFrameSuiteBase { 14 | val d = List( 15 | TestRow(0, "a"), 16 | TestRow(1, "b"), 17 | TestRow(2, "c"), 18 | TestRow(3, "a"), 19 | TestRow(4, "a"), 20 | TestRow(5, "c") 21 | ) 22 | 23 | test("test spark context") { 24 | val session = spark 25 | val rdd = session.sparkContext.parallelize(1 to 10) 26 | assert(rdd.sum === 55) 27 | } 28 | 29 | test("simple indexer test") { 30 | val session = spark 31 | import session.implicits._ 32 | val ds: Dataset[TestRow] = session.createDataset(d) 33 | val indexer = new SimpleIndexer() 34 | indexer.setInputCol("inputColumn") 35 | indexer.setOutputCol("categoryIndex") 36 | val model = indexer.fit(ds) 37 | val predicted = model.transform(ds) 38 | assert(predicted.columns.contains("categoryIndex")) 39 | predicted.show() 40 | } 41 | } -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.objects; 2 | 3 | import java.io.Serializable; 4 | 5 | public class JavaPandas implements Serializable { 6 | private String name; 7 | private String zip; 8 | private int pandaSize; 9 | private int age; 10 | 11 | /** 12 | * @param name name of panda 13 | * @param zip zip code 14 | * @param pandaSize size of panda in KG 15 | * @param age age of panda 16 | */ 17 | public JavaPandas(String name, String zip, int pandaSize, int age) { 18 | this.name = name; 19 | this.zip = zip; 20 | this.pandaSize = pandaSize; 21 | this.age = age; 22 | } 23 | 24 | public String getName() { 25 | return name; 26 | } 27 | 28 | public void setName(String name) { 29 | this.name = name; 30 | } 31 | 32 | public String getZip() { 33 | return zip; 34 | } 35 | 36 | public void setZip(String zip) { 37 | this.zip = zip; 38 | } 39 | 40 | public int getPandaSize() { 41 | return pandaSize; 42 | } 43 | 44 | public void setPandaSize(int pandaSize) { 45 | this.pandaSize = pandaSize; 46 | } 47 | 48 | public int getAge() { 49 | return age; 50 | } 51 | 52 | public void setAge(int age) { 53 | this.age = age; 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Basic tests for our MLlib examples 3 | */ 4 | package com.highperformancespark.examples.mllib 5 | 6 | import org.apache.spark.mllib.linalg.{Vector => SparkVector} 7 | 8 | import com.highperformancespark.examples.dataframe.RawPanda 9 | import com.holdenkarau.spark.testing._ 10 | import org.scalatest.funsuite.AnyFunSuite 11 | 12 | class GoldilocksMLlibSuite extends AnyFunSuite with SharedSparkContext { 13 | val rps = List( 14 | RawPanda(1L, "94110", "giant", true, Array(0.0, 0.0)), 15 | RawPanda(2L, "94110", "giant", false, Array(0.0, 3.0)), 16 | RawPanda(3L, "94110", "giant", true, Array(0.0, 2.0))) 17 | 18 | test("boolean to double") { 19 | assert(1.0 === GoldilocksMLlib.booleanToDouble(true)) 20 | assert(0.0 
=== GoldilocksMLlib.booleanToDouble(false)) 21 | } 22 | 23 | test("encoding") { 24 | val input = sc.parallelize(rps) 25 | val points = GoldilocksMLlib.toLabeledPointDense(input) 26 | assert(points.count() == 3) 27 | assert(points.filter(_.label != 0.0).count() == 2) 28 | } 29 | 30 | test("lookup table") { 31 | val input = sc.parallelize(List("hi", "bye", "coffee", "hi")) 32 | val table = GoldilocksMLlib.createLabelLookup(input) 33 | assert(table.size == 3) 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /python/examples/test_dual_write_new.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | # tag::test[] 5 | import unittest 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql.functions import current_timestamp 8 | from pyspark.sql.types import Row 9 | from pyspark.testing.utils import assertDataFrameEqual 10 | from .dual_write import DualWriteExample 11 | 12 | 13 | class DualWriteTest(unittest.TestCase): 14 | @classmethod 15 | def setUpClass(cls): 16 | cls.spark = SparkSession.builder.appName( 17 | "Testing PySpark Example" 18 | ).getOrCreate() 19 | 20 | @classmethod 21 | def tearDownClass(cls): 22 | cls.spark.stop() 23 | 24 | def test_always_passes(self): 25 | self.assertTrue(True) 26 | 27 | def test_actual_dual_write(self): 28 | tempdir = tempfile.mkdtemp() 29 | p1 = os.path.join(tempdir, "data1") 30 | p2 = os.path.join(tempdir, "data2") 31 | df = self.spark.createDataFrame([Row("timbit"), Row("farted")], ["names"]) 32 | combined = df.withColumn("times", current_timestamp()) 33 | DualWriteExample().do_write(combined, p1, p2) 34 | df1 = self.spark.read.format("parquet").load(p1) 35 | df2 = self.spark.read.format("parquet").load(p2) 36 | assertDataFrameEqual(df2.select("times"), df1, 0.1) 37 | 38 | 39 | # end::test[] 40 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/JavaInterop.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples; 2 | 3 | import scala.reflect.*; 4 | import scala.Tuple2; 5 | 6 | import org.apache.spark.rdd.RDD; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.JavaPairRDD; 9 | import org.apache.spark.api.java.JavaSparkContext; 10 | 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | import static org.apache.spark.sql.functions.*; 15 | 16 | public class JavaInterop { 17 | 18 | //tag::realClassTag[] 19 | public static JavaPairRDD wrapPairRDD( 20 | RDD> rdd) { 21 | // Construct the class tags 22 | ClassTag strCt = ClassTag$.MODULE$.apply(String.class); 23 | ClassTag longCt = ClassTag$.MODULE$.apply(scala.Long.class); 24 | return new JavaPairRDD(rdd, strCt, longCt); 25 | } 26 | //end::realClassTag[] 27 | 28 | //tag::fakeClassTag[] 29 | public static JavaPairRDD wrapPairRDDFakeCt( 30 | RDD> rdd) { 31 | // Construct the class tags by casting AnyRef - this would be more commonly done 32 | // with generic or templated code where we can't explicitly construct the correct 33 | // class tag as using fake class tags may result in degraded performance. 
34 | ClassTag fake = ClassTag$.MODULE$.AnyRef(); 35 | return new JavaPairRDD(rdd, fake, fake); 36 | } 37 | //end::fakeClassTag[] 38 | } 39 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Tests that we filter out bad pandas. 3 | */ 4 | package com.highperformancespark.examples.tools 5 | 6 | import com.highperformancespark.examples.dataframe.RawPanda 7 | import com.holdenkarau.spark.testing._ 8 | import org.scalatest.funsuite.AnyFunSuite 9 | 10 | class FilterInvalidPandasSuite extends AnyFunSuite with SharedSparkContext { 11 | test("simple filter") { 12 | val invalidPandas = List(1L, 2L) 13 | val inputPandas = List( 14 | RawPanda(1L, "94110", "giant", true, Array(0.0)), 15 | RawPanda(3L, "94110", "giant", true, Array(0.0))) 16 | val input = sc.parallelize(inputPandas) 17 | val result1 = 18 | FilterInvalidPandas.filterInvalidPandas(sc, invalidPandas, input) 19 | val result2 = 20 | FilterInvalidPandas.filterInvalidPandasWithLogs(sc, invalidPandas, input) 21 | assert(result1.collect() === result2.collect()) 22 | assert(result1.count() === 1) 23 | } 24 | 25 | test("alt log") { 26 | val invalidPandas = List(1L, 2L) 27 | val inputPandas = List( 28 | RawPanda(1L, "94110", "giant", true, Array(0.0)), 29 | RawPanda(3L, "94110", "giant", true, Array(0.0))) 30 | val input = sc.parallelize(inputPandas) 31 | val al = new AltLog() 32 | val result1 = 33 | al.filterInvalidPandasWithLogs(sc, invalidPandas, input) 34 | assert(result1.count() === 1) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # high-performance-spark-examples 2 | Examples for High Performance Spark 3 | 4 | We are in the process of updating this for Spark 4 (some parts depending on external libraries like Iceberg, Comet, etc. are still 3.X) and the second edition of our book! 5 | 6 | # Building 7 | 8 | Most of the examples can be built with sbt; the C and Fortran components depend on gcc, g77, and cmake. 9 | 10 | # Tests 11 | 12 | The full test suite depends on having the C and Fortran components built as well as a local R installation available. 13 | 14 | The most "accurate" way of seeing how we run the tests is to look at the .github workflows 15 | 16 | # History Server 17 | 18 | The history server can be a great way to figure out what's going on. 19 | 20 | By default the history server uses `/tmp/spark-events`, so you'll need to create that directory if it isn't already set up: 21 | 22 | `mkdir -p /tmp/spark-events` 23 | 24 | The scripts for running the examples generally run with the event log enabled. 25 | 26 | You can set `SPARK_EVENTLOG=true` before running the Scala tests and you'll get the event logs for the history server too! 27 | 28 | e.g.
29 | 30 | `SPARK_EVENTLOG=true sbt test` 31 | 32 | If you want to run just a specific test you can run [testOnly](https://www.scala-sbt.org/1.x/docs/Testing.html) 33 | 34 | Then to view the history server you'll want to launch it using the `${SPARK_HOME}/sbin/start-history-server.sh` then you [can go to your local history server](http://localhost:18080/) 35 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Test that the accumulator example computes stuff. 3 | */ 4 | package com.highperformancespark.examples.transformations 5 | 6 | import scala.collection.immutable.HashSet 7 | 8 | import com.highperformancespark.examples.dataframe.RawPanda 9 | import com.holdenkarau.spark.testing._ 10 | import org.scalatest.funsuite.AnyFunSuite 11 | 12 | class AccumulatorsTest extends AnyFunSuite with SharedSparkContext { 13 | test("accumulator max should function") { 14 | val input = sc.parallelize(1.to(100)).map(x => 15 | RawPanda(1L, "1", "red", true, Array(x.toDouble))) 16 | val (_, max) = Accumulators.computeMaxFuzzyNess(sc, input) 17 | assert(max === 100.0) 18 | } 19 | 20 | test("accumulator sum should function") { 21 | val input = sc.parallelize(1.to(100)).map(x => 22 | RawPanda(1L, "1", "red", true, Array(x.toDouble))) 23 | val (_, sum) = Accumulators.computeTotalFuzzyNess(sc, input) 24 | assert(sum === 5050.0) 25 | } 26 | 27 | test("accumulator unique should function") { 28 | val input1 = sc.parallelize(1 to 100).map(x => 29 | RawPanda(1L, "1", "red", true, Array(x.toDouble)) 30 | ) 31 | 32 | val input2 = sc.parallelize(1 to 100).map(x => 33 | RawPanda(2L, "2", "blude", false, Array(x.toDouble)) 34 | ) 35 | 36 | val set = Accumulators.uniquePandas(sc, input1 ++ input2) 37 | assert(set == HashSet(2, 1)) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/tools/ResourceProfileEx.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.gpu 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.resource._ 5 | import org.apache.spark.resource.ResourceProfileBuilder 6 | import org.apache.spark.TaskContext 7 | 8 | object GPUResourceProfileExample { 9 | def main(args: Array[String]): Unit = { 10 | val spark = SparkSession.builder() 11 | .appName("GPUResourceProfileExample") 12 | .getOrCreate() 13 | run(spark) 14 | } 15 | 16 | def run(spark: SparkSession) = { 17 | val sc = spark.sparkContext 18 | //tag::gpuResourceProfileExample[] 19 | // Create a resource profile requesting 2 NVIDIA GPUs per executor and 1 per task 20 | val gpuResourceProfile = new ResourceProfileBuilder() 21 | .require(new ExecutorResourceRequests().resource( 22 | "gpu", 2, vendor="nvidia", 23 | discoveryScript="/opt/spark/bin/getGpusResources.sh" // See sample in Spark repo 24 | )) 25 | .require(new TaskResourceRequests().resource("gpu", 1)) 26 | .build() 27 | 28 | // Use resource profile to run on a machine with GPUs. 
29 | val rdd = sc.parallelize(1 to 4, 4) 30 | .withResources(gpuResourceProfile) 31 | .map { i => 32 | // Do some special GPU stuff here my friend 33 | i 34 | } 35 | //end::gpuResourceProfileExample[] 36 | 37 | rdd.collect().foreach(println) 38 | 39 | spark.stop() 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /core/src/main/r/dapply.R: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | #tag::example[] 19 | library(SparkR) 20 | 21 | # Setup SparkContext & SQLContext 22 | sc <- sparkR.init(appName="high-performance-spark-wordcount-example") 23 | 24 | # Initialize SQLContext 25 | sqlContext <- sparkRSQL.init(sc) 26 | 27 | 28 | # Count the number of characters - note this fails on the text DF due to a bug. 29 | df <- createDataFrame (sqlContext, 30 | list(list(1L, 1, "1"), 31 | list(2L, 2, "22"), 32 | list(3L, 3, "333")), 33 | c("a", "b", "c")) 34 | resultingSchema <- structType(structField("length", "integer")) 35 | result <- dapply(df, function(row) { 36 | y <- list() 37 | y <- cbind(y, nchar(row[[3]])) 38 | }, resultingSchema) 39 | showDF(result) 40 | #end::example[] 41 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.objects; 2 | 3 | import java.io.Serializable; 4 | 5 | public class JavaPandaInfo implements Serializable { 6 | private String place; 7 | private String pandaType; 8 | private int happyPandas; 9 | private int totalPandas; 10 | 11 | /** 12 | * @param place name of place 13 | * @param pandaType type of pandas in this place 14 | * @param happyPandas number of happy pandas in this place 15 | * @param totalPandas total number of pandas in this place 16 | */ 17 | public JavaPandaInfo(String place, String pandaType, int happyPandas, int totalPandas) { 18 | this.place = place; 19 | this.pandaType = pandaType; 20 | this.happyPandas = happyPandas; 21 | this.totalPandas = totalPandas; 22 | } 23 | 24 | public String getPlace() { 25 | return place; 26 | } 27 | 28 | public void setPlace(String place) { 29 | this.place = place; 30 | } 31 | 32 | public String getPandaType() { 33 | return pandaType; 34 | } 35 | 36 | public void setPandaType(String pandaType) { 37 | this.pandaType = pandaType; 38 | } 39 | 40 | public int getHappyPandas() { 41 | return happyPandas; 42 | } 43 | 44 | public void setHappyPandas(int happyPandas) { 45 | this.happyPandas = happyPandas; 46 | } 47 | 48 | public int getTotalPandas() { 49 | return 
totalPandas; 50 | } 51 | 52 | public void setTotalPandas(int totalPandas) { 53 | this.totalPandas = totalPandas; 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /python/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | passenv = * 3 | isolated_build = True 4 | requires = tox-conda 5 | envlist = 6 | isort 7 | py310 8 | black 9 | mypy 10 | flake8 11 | 12 | skip_missing_interpeters = true 13 | 14 | [gh-actions] 15 | python = 16 | # 3.9: py39 17 | # We need a new version of PySpark w/3.10 support. 18 | 3.10: py310 19 | 20 | [testenv] 21 | setenv = 22 | DJANGO_SETTINGS_MODULE=fighthealthinsurance.settings 23 | PYTHONPATH={toxinidir} 24 | DJANGO_CONFIGURATION=Dev 25 | passenv = * 26 | extras = 27 | tests 28 | coverage 29 | deps = 30 | pytest 31 | isort==4.3.21 32 | pyspark==3.5.0 33 | flake8 34 | spark-testing-base>=0.11.1 35 | mypy 36 | -rrequirements.txt 37 | commands = 38 | pytest examples \ 39 | {posargs} 40 | allowlist_externals = pytest 41 | 42 | [testenv:isort] 43 | extras = tests 44 | skipsdist = True 45 | commands = isort --check-only --diff examples 46 | allowlist_externals = isort 47 | 48 | [testenv:black] 49 | extras = tests 50 | skipsdist = True 51 | commands = black --check examples 52 | allowlist_externals = black 53 | deps = 54 | black 55 | -rrequirements.txt 56 | 57 | [testenv:flake8] 58 | extras = tests 59 | skipsdist = True 60 | commands = flake8 --ignore=F403,E402,F401,F405,W503,E265 examples 61 | allowlist_externals = flake8 62 | 63 | [testenv:mypy] 64 | extras = tests 65 | passenv = * 66 | deps = 67 | pytest 68 | mypy 69 | -rrequirements.txt 70 | setenv = 71 | {[testenv]setenv} 72 | MYPYPATH={toxinidir} 73 | commands = 74 | mypy -m examples 75 | allowlist_externals = mypy -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Verify that generate scaling data returns results 3 | */ 4 | package com.highperformancespark.examples.tools 5 | 6 | import com.highperformancespark.examples.dataframe.RawPanda 7 | import com.holdenkarau.spark.testing._ 8 | import org.scalatest.funsuite.AnyFunSuite 9 | 10 | class GeneratescalaingDataSuite extends AnyFunSuite with SharedSparkContext { 11 | // The number of entries depends somewhat on the partition split because we 12 | // zip multiple separate RDDs so its more of a "request" 13 | test("expected num entries") { 14 | val result = GenerateScalingData.generateFullGoldilocks(sc, 10L, 20) 15 | assert(result.count() <= 10) 16 | assert(result.count() > 5) 17 | assert(result.map(_.id).distinct().count() > 1) 18 | } 19 | 20 | test("expected num entries same id") { 21 | val result = GenerateScalingData.generateGoldilocks(sc, 5L, 20) 22 | assert(result.count() <= 5) 23 | assert(result.count() >= 2) 24 | assert(result.map(_.id).distinct().count() == 1) 25 | } 26 | 27 | test("mini scale data") { 28 | val result = GenerateScalingData.generateMiniScale(sc, 20L, 1) 29 | assert(result.count() <= 20) 30 | assert(result.count() > 5) 31 | assert(result.map(_._1).distinct().count() > 1) 32 | } 33 | 34 | test("mini scale rows") { 35 | val result = GenerateScalingData.generateMiniScaleRows(sc, 20L, 1) 36 | assert(result.count() <= 20) 37 | assert(result.count() > 5) 38 | assert(result.map(_(0)).distinct().count() > 1) 39 | } 40 | } 41 | 
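GenerateScalingDataSuite above only checks that the generators return a plausible number of rows. Below is a rough sketch of driving the same generator outside of the tests, relying only on the call shape exercised in the suite (`GenerateScalingData.generateFullGoldilocks(sc, 1000L, 20)` mirrors the suite's `generateFullGoldilocks(sc, 10L, 20)` with a larger request); the `ScalingDataDriver` object, app name, master setting, and argument values are illustrative assumptions, not anything defined in the repository.

```scala
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import com.highperformancespark.examples.tools.GenerateScalingData

// Hypothetical driver for manually eyeballing the generated scaling data.
object ScalingDataDriver {
  def main(args: Array[String]): Unit = {
    // local[4] is just for a quick standalone run; drop it when using spark-submit.
    val conf = new SparkConf().setAppName("generate-scaling-data").setMaster("local[4]")
    val sc = new SparkContext(conf)
    // As the suite notes, the row count is a "request" rather than an exact number,
    // because the generator zips several independently generated RDDs.
    val pandas = GenerateScalingData.generateFullGoldilocks(sc, 1000L, 20)
    println(s"Generated ${pandas.count()} records in ${pandas.getNumPartitions} partitions")
    sc.stop()
  }
}
```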
-------------------------------------------------------------------------------- /core/src/main/perl/xt/boilerplate.t: -------------------------------------------------------------------------------- 1 | #!perl -T 2 | use 5.006; 3 | use strict; 4 | use warnings; 5 | use Test::More; 6 | 7 | plan tests => 3; 8 | 9 | sub not_in_file_ok { 10 | my ($filename, %regex) = @_; 11 | open( my $fh, '<', $filename ) 12 | or die "couldn't open $filename for reading: $!"; 13 | 14 | my %violated; 15 | 16 | while (my $line = <$fh>) { 17 | while (my ($desc, $regex) = each %regex) { 18 | if ($line =~ $regex) { 19 | push @{$violated{$desc}||=[]}, $.; 20 | } 21 | } 22 | } 23 | 24 | if (%violated) { 25 | fail("$filename contains boilerplate text"); 26 | diag "$_ appears on lines @{$violated{$_}}" for keys %violated; 27 | } else { 28 | pass("$filename contains no boilerplate text"); 29 | } 30 | } 31 | 32 | sub module_boilerplate_ok { 33 | my ($module) = @_; 34 | not_in_file_ok($module => 35 | 'the great new $MODULENAME' => qr/ - The great new /, 36 | 'boilerplate description' => qr/Quick summary of what the module/, 37 | 'stub function definition' => qr/function[12]/, 38 | ); 39 | } 40 | 41 | TODO: { 42 | local $TODO = "Need to replace the boilerplate text"; 43 | 44 | not_in_file_ok(README => 45 | "The README is used..." => qr/The README is used/, 46 | "'version information here'" => qr/to provide version information/, 47 | ); 48 | 49 | not_in_file_ok(Changes => 50 | "placeholder date/time" => qr(Date/time) 51 | ); 52 | 53 | module_boilerplate_ok('lib/HighPerformanceSpark/Examples.pm'); 54 | 55 | 56 | } 57 | 58 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.objects; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | 6 | public class JavaRawPanda implements Serializable { 7 | private long id; 8 | private String zip; 9 | private String pt; 10 | private boolean happy; 11 | private List attributes; 12 | 13 | /** 14 | * @param id panda id 15 | * @param zip zip code of panda residence 16 | * @param pt Type of panda as a string 17 | * @param happy if panda is happy 18 | * @param attributes array of panada attributes 19 | */ 20 | public JavaRawPanda(long id, String zip, String pt, boolean happy, List attributes) { 21 | this.attributes = attributes; 22 | this.id = id; 23 | this.zip = zip; 24 | this.pt = pt; 25 | this.happy = happy; 26 | } 27 | 28 | public long getId() { 29 | return id; 30 | } 31 | 32 | public void setId(long id) { 33 | this.id = id; 34 | } 35 | 36 | public String getZip() { 37 | return zip; 38 | } 39 | 40 | public void setZip(String zip) { 41 | this.zip = zip; 42 | } 43 | 44 | public String getPt() { 45 | return pt; 46 | } 47 | 48 | public void setPt(String pt) { 49 | this.pt = pt; 50 | } 51 | 52 | public boolean isHappy() { 53 | return happy; 54 | } 55 | 56 | public void setHappy(boolean happy) { 57 | this.happy = happy; 58 | } 59 | 60 | public List getAttributes() { 61 | return attributes; 62 | } 63 | 64 | public void setAttributes(List attributes) { 65 | this.attributes = attributes; 66 | } 67 | } -------------------------------------------------------------------------------- /core/src/test/java/com/highperformancespark/examples/JavaInteropTest.java: -------------------------------------------------------------------------------- 1 | package 
com.highperformancespark.examples; 2 | 3 | import com.holdenkarau.spark.testing.SharedJavaSparkContext; 4 | 5 | import scala.Tuple2; 6 | 7 | import org.apache.spark.rdd.RDD; 8 | import org.apache.spark.api.java.JavaRDD; 9 | import org.apache.spark.api.java.JavaPairRDD; 10 | import org.apache.spark.api.java.JavaSparkContext; 11 | 12 | import java.util.ArrayList; 13 | import java.util.Arrays; 14 | import java.util.List; 15 | 16 | import static org.junit.Assert.*; 17 | 18 | import org.junit.Test; 19 | import static org.junit.Assert.assertEquals; 20 | import static org.junit.Assert.assertTrue; 21 | 22 | public class JavaInteropTest extends SharedJavaSparkContext { 23 | 24 | @Test 25 | public void wrapPairRDDTest() { 26 | JavaInteropTestHelper helper = new JavaInteropTestHelper(sc()); 27 | JavaInterop ji = new JavaInterop(); 28 | RDD> rdd = helper.generateMiniPairRDD(); 29 | JavaPairRDD prdd = ji.wrapPairRDD(rdd); 30 | List> expected = Arrays.asList(new Tuple2("panda", 12L)); 31 | assertEquals(expected, prdd.collect()); 32 | } 33 | 34 | @Test 35 | public void wrapPairRDDFakeCtTest() { 36 | JavaInteropTestHelper helper = new JavaInteropTestHelper(sc()); 37 | JavaInterop ji = new JavaInterop(); 38 | RDD> rdd = helper.generateMiniPairRDD(); 39 | JavaPairRDD prdd = ji.wrapPairRDDFakeCt(rdd); 40 | List> expected = Arrays.asList(new Tuple2("panda", 12L)); 41 | assertEquals(expected, prdd.collect()); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.wordcount 2 | 3 | /** 4 | * What sort of big data book would this be if we didn't mention wordcount? 5 | */ 6 | import org.apache.spark.rdd._ 7 | 8 | object WordCount { 9 | // bad idea: uses group by key 10 | def badIdea(rdd: RDD[String]): RDD[(String, Int)] = { 11 | val words = rdd.flatMap(_.split(" ")) 12 | val wordPairs = words.map((_, 1)) 13 | val grouped = wordPairs.groupByKey() 14 | val wordCounts = grouped.mapValues(_.sum) 15 | wordCounts 16 | } 17 | 18 | // good idea: doesn't use group by key 19 | //tag::simpleWordCount[] 20 | def simpleWordCount(rdd: RDD[String]): RDD[(String, Int)] = { 21 | val words = rdd.flatMap(_.split(" ")) 22 | val wordPairs = words.map((_, 1)) 23 | val wordCounts = wordPairs.reduceByKey(_ + _) 24 | wordCounts 25 | } 26 | //end::simpleWordCount[] 27 | 28 | /** 29 | * Come up with word counts but filter out the illegal tokens and stop words 30 | */ 31 | //tag::wordCountStopwords[] 32 | def withStopWordsFiltered(rdd : RDD[String], illegalTokens : Array[Char], 33 | stopWords : Set[String]): RDD[(String, Int)] = { 34 | val separators = illegalTokens ++ Array[Char](' ') 35 | val tokens: RDD[String] = rdd.flatMap(_.split(separators). 36 | map(_.trim.toLowerCase)) 37 | val words = tokens.filter(token => 38 | !stopWords.contains(token) && (token.length > 0) ) 39 | val wordPairs = words.map((_, 1)) 40 | val wordCounts = wordPairs.reduceByKey(_ + _) 41 | wordCounts 42 | } 43 | //end::wordCountStopwords[] 44 | } 45 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/dataframe/PandaPlaceFilterPushdown.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Happy Panda Example for DataFrames. 3 | * Computes the % of happy pandas. Very contrived. 
4 | */ 5 | package com.highperformancespark.examples.dataframe 6 | 7 | import scala.collection.mutable 8 | import scala.util.Random 9 | 10 | import org.apache.spark.sql.DataFrame 11 | import org.apache.spark.sql.Row 12 | import org.apache.spark.sql.{SQLContext, SparkSession} 13 | import org.apache.spark.sql.types._ 14 | 15 | import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo 16 | import com.highperformancespark.examples.dataframe.HappyPandas.Pandas 17 | import com.holdenkarau.spark.testing._ 18 | import org.scalatest.funsuite.AnyFunSuite 19 | import org.scalatest.matchers.should.Matchers._ 20 | 21 | case class ExtraMagic( 22 | place: String, 23 | pandaType: String, 24 | happyPandas: Integer, 25 | totalPandas: Integer, 26 | extraInfo: Integer) 27 | 28 | 29 | class PandaPlaceFilterPushdown extends AnyFunSuite with DataFrameSuiteBase { 30 | 31 | override def appName: String = "pandaPlaceFilterPushdown" 32 | 33 | val basicList = List( 34 | ExtraMagic("a", "b", 1, 2, 3), 35 | ExtraMagic("toronto", "b", 1, 2, 3), 36 | ) 37 | 38 | test("simpleFilterTest") { 39 | val sqlCtx = sqlContext 40 | import sqlCtx.implicits._ 41 | val inputDF = sqlCtx.createDataFrame(basicList) 42 | val restrictedDF = inputDF.select($"place", $"pandaType", $"happyPandas", $"totalPandas") 43 | val switched = inputDF.as[PandaInfo] 44 | // Note if we write the filter with functional syntax it does not push down. 45 | val filtered = switched.filter($"place" === "a") 46 | assert(filtered.count() === 1) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Test our simple JNI 3 | */ 4 | package com.highperformancespark.examples.ffi 5 | 6 | import com.holdenkarau.spark.testing._ 7 | import org.scalacheck.Arbitrary 8 | import org.scalacheck.Gen 9 | import org.scalacheck.Prop.forAll 10 | import org.scalatest.funsuite.AnyFunSuite 11 | import org.scalatest.matchers.should.Matchers._ 12 | import org.scalatestplus.scalacheck.Checkers 13 | 14 | class NativeExampleSuite extends AnyFunSuite 15 | with SharedSparkContext with Checkers with RDDComparisons { 16 | 17 | test("local sum") { 18 | val input = Array(1, 2, 3) 19 | val sumMagic = new SumJNI() 20 | val result = sumMagic.sum(input) 21 | val expected = 6 22 | assert(result === expected) 23 | } 24 | 25 | test("super simple test") { 26 | val input = sc.parallelize(List(("hi", Array(1, 2, 3)))) 27 | val result = NativeExample.jniSum(input).collect() 28 | val expected = List(("hi", 6)) 29 | assert(result === expected) 30 | } 31 | 32 | test("native call should find sum correctly") { 33 | val property = forAll( 34 | RDDGenerator.genRDD[(String, Array[Int])](sc)( 35 | Arbitrary.arbitrary[(String, Array[Int])])) { 36 | rdd => 37 | val expected = rdd.mapValues(_.sum) 38 | val result = NativeExample.jniSum(rdd) 39 | compareRDDWithOrder(expected, result).isEmpty 40 | } 41 | check(property) 42 | } 43 | 44 | test("JNA support") { 45 | val input = Array(1, 2, 3) 46 | assert(6 === SumJNA.sum(input, input.size)) 47 | } 48 | 49 | test("JNA Fortran support") { 50 | val input = Array(1, 2, 3) 51 | assert(6 === SumFJNA.easySum(input.size, input)) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/dataframe/SQLExtensionTest.scala: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Happy Panda Example for DataFrames. 3 | * Computes the % of happy pandas. Very contrived. 4 | */ 5 | package com.highperformancespark.examples.dataframe 6 | 7 | import scala.collection.mutable 8 | import scala.util.Random 9 | 10 | import org.apache.spark.SparkConf 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.execution.ExplainMode 13 | import org.apache.spark.sql.types.IntegerType 14 | import org.apache.spark.sql.functions.{lower, rand} 15 | import org.apache.spark.sql.types._ 16 | 17 | import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo 18 | import com.highperformancespark.examples.dataframe.HappyPandas.Pandas 19 | import com.holdenkarau.spark.testing._ 20 | import org.scalatest.funsuite.AnyFunSuite 21 | import org.scalatest.matchers.should.Matchers._ 22 | 23 | class SQLExtensionTest extends AnyFunSuite with ScalaDataFrameSuiteBase { 24 | 25 | val rawPandaList = List( 26 | RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9)), 27 | RawPanda(11L, "94110", "red", true, Array(1.0, 0.9))) 28 | 29 | override def conf: SparkConf = { 30 | val initialConf = super.conf 31 | initialConf.set( 32 | "spark.sql.extensions", 33 | "com.highperformancespark.examples.dataframe.SQLExtension") 34 | } 35 | 36 | def explainToString(df: DataFrame): String = { 37 | df.queryExecution.explainString(ExplainMode.fromString("extended")) 38 | } 39 | 40 | test("Magic") { 41 | import spark.implicits._ 42 | val inputDF = spark.createDataFrame(rawPandaList) 43 | spark.sql("DROP TABLE IF EXISTS farts") 44 | inputDF.write.saveAsTable("farts") 45 | val testDF = spark.read.table("farts") 46 | val explained: String = explainToString(testDF.select($"zip".cast(IntegerType))) 47 | explained should include ("isnotnull(zip#") 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Example UDFs 3 | */ 4 | package com.highperformancespark.examples.dataframe 5 | 6 | import org.apache.spark.sql._ 7 | import org.apache.spark.sql.expressions._ 8 | import org.apache.spark.sql.types._ 9 | 10 | object UDFs { 11 | //tag::setupUDFs[] 12 | def setupUDFs(sqlCtx: SQLContext) = { 13 | sqlCtx.udf.register("strLen", (s: String) => s.length()) 14 | } 15 | //end::setupUDFs[] 16 | 17 | //tag::setupUDAFs[] 18 | def setupUDAFs(sqlCtx: SQLContext) = { 19 | class Avg extends UserDefinedAggregateFunction { 20 | // Input type 21 | def inputSchema: org.apache.spark.sql.types.StructType = 22 | StructType(StructField("value", DoubleType) :: Nil) 23 | 24 | def bufferSchema: StructType = StructType( 25 | StructField("count", LongType) :: 26 | StructField("sum", DoubleType) :: Nil 27 | ) 28 | 29 | // Return type 30 | def dataType: DataType = DoubleType 31 | 32 | def deterministic: Boolean = true 33 | 34 | def initialize(buffer: MutableAggregationBuffer): Unit = { 35 | buffer(0) = 0L 36 | buffer(1) = 0.0 37 | } 38 | 39 | def update(buffer: MutableAggregationBuffer,input: Row): Unit = { 40 | buffer(0) = buffer.getAs[Long](0) + 1 41 | buffer(1) = buffer.getAs[Double](1) + input.getAs[Double](0) 42 | } 43 | 44 | def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 45 | buffer1(0) = buffer1.getAs[Long](0) + buffer2.getAs[Long](0) 46 | buffer1(1) = buffer1.getAs[Double](1) + buffer2.getAs[Double](1) 47 
| } 48 | 49 | def evaluate(buffer: Row): Any = { 50 | buffer.getDouble(1) / buffer.getLong(0) 51 | } 52 | } 53 | // Optionally register 54 | val avg = new Avg 55 | sqlCtx.udf.register("ourAvg", avg) 56 | } 57 | //end::setupUDAFs[] 58 | } 59 | -------------------------------------------------------------------------------- /python/examples/pandera_ex.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.session import SparkSession 2 | 3 | # tag::pandera_imports[] 4 | import pandera.pyspark as pa 5 | import pyspark.sql.types as T 6 | 7 | # end::pandera_imports[] 8 | 9 | 10 | # tag::simple_data_schema[] 11 | class ProjectDataSchema(pa.DataFrameModel): 12 | # Note str_length is currently broken :/ 13 | creator: T.StringType() = pa.Field(str_length={"min_value": 1}) 14 | projectname: T.StringType() = pa.Field() 15 | stars: T.IntegerType() = pa.Field(ge=0) 16 | 17 | 18 | # end::simple_data_schema[] 19 | 20 | 21 | # tag::gender_data[] 22 | class GenderData(pa.DataFrameModel): 23 | MaleBonusPercent: T.DoubleType() = pa.Field(nullable=True, le=5) 24 | FemaleBonusPercent: T.DoubleType() = pa.Field(nullable=True) 25 | CompanyNumber: T.IntegerType() = pa.Field() 26 | 27 | 28 | # end::gender_data[] 29 | 30 | if __name__ == "__main__": 31 | spark = SparkSession.builder.master("local[4]").getOrCreate() 32 | # Make sure to make 33 | # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" 34 | # available as ./data/2021 35 | uk_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) 36 | 37 | # tag::validate_gender_data[] 38 | validated_df = GenderData(uk_df) 39 | # Print out the errors. You may wish to exit with an error condition. 40 | if validated_df.pandera.errors != {}: 41 | print(validated_df.pandera.errors) 42 | # sys.exit(1) 43 | # end::validate_gender_data[] 44 | 45 | # tag::validate_project_data[] 46 | project_data = spark.read.csv("./data/project.csv", header=True, inferSchema=True) 47 | validated_df = ProjectDataSchema(project_data) 48 | # Print out the errors. You may wish to exit with an error condition. 
49 | if validated_df.pandera.errors != {}: 50 | print(validated_df.pandera.errors) 51 | # sys.exit(1) 52 | # end::validate_project_data[] 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | .bsp 15 | 16 | # Scala-IDE specific 17 | .scala_dependencies 18 | .worksheet 19 | .idea/ 20 | 21 | # emacs stuff 22 | \#*\# 23 | \.\#* 24 | *~ 25 | sbt/*launch*.jar 26 | 27 | # VSCode specific 28 | .vscode 29 | .history 30 | 31 | # Metals 32 | .metals 33 | .bloop 34 | metals.sbt 35 | 36 | # python 37 | *.pyc 38 | .tox 39 | .bsp 40 | 41 | # Distribution / packaging 42 | .Python 43 | build/ 44 | develop-eggs/ 45 | dist/ 46 | downloads/ 47 | eggs/ 48 | .eggs/ 49 | lib/ 50 | lib64/ 51 | parts/ 52 | sdist/ 53 | var/ 54 | wheels/ 55 | share/python-wheels/ 56 | *.egg-info/ 57 | .installed.cfg 58 | *.egg 59 | MANIFEST 60 | # scala stuff 61 | .metals 62 | 63 | # native 64 | *.o 65 | *.so 66 | *.so.0.0.0 67 | *.so.0 68 | 69 | # Spark files 70 | *.tgz 71 | iceberg-spark-runtime-*.jar 72 | spark-*-bin-hadoop*/ 73 | 74 | # Warehouse 75 | spark-warehouse/ 76 | warehouse/ 77 | metastore_db/ 78 | 79 | # Misc internal stuff 80 | sql/*.sql.out 81 | python/examples/*.py.out 82 | data/fetched/* 83 | spark_expectations_sample_rules.json 84 | 85 | # more python 86 | pyspark_venv.tar.gz 87 | pyspark_venv/ 88 | 89 | # accel stuff 90 | accelerators/*.jar 91 | accelerators/arrow-datafusion-comet 92 | # ignore gluten 93 | gluten 94 | gluten*.jar 95 | spark-3*hadoop*/ 96 | spark-3*hadoop*.tgz 97 | accelerators/incubator-gluten 98 | # ignore the temporary myapp from the dockerbuild 99 | myapp.tar 100 | # ignore glutten 101 | incubator-glutten/* 102 | # ignore nested build file. 103 | project/build.sbt 104 | coursier 105 | # Magic file we use for build tracking 106 | oldhash 107 | # ignore ipynb checkpoints 108 | .ipynb_checkpoints/ 109 | 110 | # ignore accel 111 | incubator-gluten/ 112 | -------------------------------------------------------------------------------- /python/examples/SQLLineage.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame, Row 2 | from pyspark.sql.session import SparkSession 3 | import sys 4 | 5 | global df 6 | global sc 7 | global rdd 8 | global spark 9 | 10 | 11 | """ 12 | >>> df = rdd.toDF() 13 | >>> df2 = cutLineage(df) 14 | >>> df.head() == df2.head() 15 | True 16 | >>> df.schema == df2.schema 17 | True 18 | """ 19 | 20 | 21 | # tag::cutLineage[] 22 | def cutLineage(df): 23 | """ 24 | Cut the lineage of a DataFrame - used for iterative algorithms 25 | 26 | .. 
Note: This uses internal members and may break between versions 27 | >>> df = rdd.toDF() 28 | >>> cutDf = cutLineage(df) 29 | >>> cutDf.count() 30 | 3 31 | """ 32 | jRDD = df._jdf.toJavaRDD() 33 | jSchema = df._jdf.schema() 34 | jRDD.cache() 35 | session = df.sparkSession 36 | javaSparkSession = session._jsparkSession 37 | newJavaDF = javaSparkSession.createDataFrame(jRDD, jSchema) 38 | newDF = DataFrame(newJavaDF, session) 39 | return newDF 40 | 41 | 42 | # end::cutLineage[] 43 | 44 | 45 | def _setupTest(): 46 | globs = globals() 47 | spark = SparkSession.builder.master("local[4]").getOrCreate() 48 | sc = spark._sc 49 | sc.setLogLevel("ERROR") 50 | globs["sc"] = sc 51 | globs["spark"] = spark 52 | globs["rdd"] = sc.parallelize( 53 | [ 54 | Row(field1=1, field2="row1"), 55 | Row(field1=2, field2="row2"), 56 | Row(field1=3, field2="row3"), 57 | ] 58 | ) 59 | return globs 60 | 61 | 62 | def _test(): 63 | """ 64 | Run the tests. 65 | """ 66 | import doctest 67 | 68 | globs = _setupTest() 69 | (failure_count, test_count) = doctest.testmod( 70 | globs=globs, optionflags=doctest.ELLIPSIS 71 | ) 72 | globs["sc"].stop() 73 | if failure_count: 74 | exit(-1) 75 | 76 | 77 | if __name__ == "__main__": 78 | _test() 79 | # Hack to support running in nose 80 | elif sys.stdout != sys.__stdout__: 81 | _setupTest() 82 | -------------------------------------------------------------------------------- /python/examples/udf.py: -------------------------------------------------------------------------------- 1 | # This script triggers a number of different PySpark errors 2 | 3 | from pyspark.sql.session import SparkSession 4 | from pyspark.sql.functions import pandas_udf, udf 5 | from typing import Iterator 6 | import sys 7 | import pandas as pd 8 | 9 | global sc 10 | 11 | 12 | # We need the session before we can use @udf 13 | spark = SparkSession.builder.master("local[4]").getOrCreate() 14 | 15 | 16 | # tag::simple_udf[] 17 | @udf("long") 18 | def classic_add1(e: int) -> int: 19 | return e + 1 20 | 21 | 22 | # end::simple_udf[] 23 | 24 | 25 | # tag::agg_new_udf[] 26 | @pandas_udf("long") 27 | def pandas_sum(s: pd.Series) -> int: 28 | return s.sum() 29 | 30 | 31 | # end::agg_new_udf[] 32 | 33 | 34 | # tag::new_udf[] 35 | @pandas_udf("long") 36 | def pandas_add1(s: pd.Series) -> pd.Series: 37 | # Vectorized operation on all of the elems in series at once 38 | return s + 1 39 | 40 | 41 | # end::new_udf[] 42 | 43 | 44 | # tag::complex_udf[] 45 | @pandas_udf("long") 46 | def pandas_nested_add1(d: pd.DataFrame) -> pd.Series: 47 | # Takes a struct and returns the age elem + 1, if we wanted 48 | # to update (e.g. return struct) we could update d and return it instead. 
49 | return d["age"] + 1 50 | 51 | 52 | # end::complex_udf[] 53 | 54 | 55 | # tag::batches_of_batches_udf[] 56 | @pandas_udf("long") 57 | def pandas_batches_of_batches(t: Iterator[pd.Series]) -> Iterator[pd.Series]: 58 | my_db_connection = None # Expensive setup logic goes here 59 | for s in t: 60 | # Do something with your setup logic 61 | if my_db_connection is None: 62 | # Vectorized operation on all of the elems in series at once 63 | yield s + 1 64 | 65 | 66 | # end::batches_of_batches_udf[] 67 | 68 | 69 | if __name__ == "__main__": 70 | # Make sure to make 71 | # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" 72 | # available as ./data/2021 73 | uk_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) 74 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package com.highperformancespark.examples.ffi 18 | 19 | import org.apache.spark.SparkContext 20 | import org.apache.spark.SparkFiles 21 | import org.apache.spark.rdd._ 22 | 23 | object PipeExample { 24 | //tag::pipeExample[] 25 | def lookupUserPRS(sc: SparkContext, input: RDD[Int]): RDD[(Int, List[String])] = { 26 | // Copy our script to the worker nodes with sc.addFile 27 | // Add file requires absolute paths 28 | val distScriptName = "ghinfo.pl" 29 | val userDir = System.getProperty("user.dir") 30 | val localScript = s"${userDir}/src/main/perl/${distScriptName}" 31 | val addedFile = sc.addFile(localScript) 32 | 33 | // Pass enviroment variables to our worker 34 | val enviromentVars = Map("user" -> "apache", "repo" -> "spark") 35 | val result = input.map(x => x.toString) 36 | .pipe(SparkFiles.get(distScriptName), enviromentVars) 37 | // Parse the results 38 | result.map{record => 39 | val elems: Array[String] = record.split(" ") 40 | (elems(0).toInt, elems.slice(1, elems.size).sorted.distinct.toList) 41 | } 42 | } 43 | //end::pipeExample[] 44 | } 45 | -------------------------------------------------------------------------------- /core/src/main/r/wc.R: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | args <- commandArgs(trailing = TRUE) 19 | 20 | if (length(args) != 1) { 21 | print("Usage: wc.R ") 22 | q("no") 23 | } 24 | 25 | fileName <- args(1) 26 | 27 | #tag::example[] 28 | 29 | library(SparkR) 30 | 31 | # Setup SparkContext & SQLContext 32 | sc <- sparkR.init(appName="high-performance-spark-wordcount-example") 33 | 34 | # Initialize SQLContext 35 | sqlContext <- sparkRSQL.init(sc) 36 | 37 | # Load some simple data 38 | 39 | df <- read.text(fileName) 40 | 41 | # Split the words 42 | words <- selectExpr(df, "split(value, \" \") as words") 43 | 44 | # Compute the count 45 | explodedWords <- select(words, alias(explode(words$words), "words")) 46 | wc <- agg(groupBy(explodedWords, "words"), "words" = "count") 47 | 48 | 49 | # Attempting to push an array back fails 50 | # resultingSchema <- structType(structField("words", "array")) 51 | # words <- dapply(df, function(line) { 52 | # y <- list() 53 | # y[[1]] <- strsplit(line[[1]], " ") 54 | # }, resultingSchema) 55 | # Also attempting even the identity transformation on a DF from read.text fails 56 | # in Spark 2.0-preview (although works fine on other DFs). 57 | 58 | # Display the result 59 | showDF(wc) 60 | #end::example[] 61 | -------------------------------------------------------------------------------- /accelerators/setup_gluten_spark34.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/spark-events 4 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 5 | ACCEL_JARS=${SCRIPT_DIR} 6 | SPARK_MAJOR_VERSION=3.4 7 | SCALA_VERSION=${SCALA_VERSION:-"2.12"} 8 | 9 | set -ex 10 | 11 | # Note: this does not work on Ubuntu 23, only on 22 12 | # You might get something like: 13 | # # C [libgluten.so+0x30c753] gluten::Runtime::registerFactory(std::string const&, std::function, std::equal_to, std::allocator > > const&)>)+0x23 14 | 15 | 16 | SPARK_VERSION=3.4.2 17 | SPARK_MAJOR=3.4 18 | HADOOP_VERSION=3 19 | SPARK_DIR="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" 20 | SPARK_FILE="${SPARK_DIR}.tgz" 21 | 22 | export SPARK_MAJOR 23 | export SPARK_VERSION 24 | 25 | source setup_gluten_deps.sh 26 | 27 | cd .. 28 | source /etc/lsb-release 29 | # Pre-baked only 30 | if [ "$DISTRIB_RELEASE" == "20.04" ]; then 31 | source ./env_setup.sh 32 | cd "${SCRIPT_DIR}" 33 | 34 | GLUTEN_JAR="gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" 35 | GLUTEN_JAR_PATH="${SCRIPT_DIR}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" 36 | 37 | if [ ! -f "${GLUTEN_JAR_PATH}" ]; then 38 | wget "https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR}" || unset GLUTEN_JAR_PATH 39 | fi 40 | 41 | fi 42 | # Rather than if/else we fall through to build if wget fails because major version is not supported. 43 | if [ -z "$GLUTEN_JAR_PATH" ]; then 44 | #tag::build_gluten[] 45 | if [ ! 
-d incubator-gluten ]; then 46 | git clone https://github.com/apache/incubator-gluten.git 47 | fi 48 | cd incubator-gluten 49 | sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON 50 | mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests 51 | GLUTEN_JAR_PATH="$(pwd)/package/target/gluten-package-*-SNAPSHOT-${SPARK_MAJOR_VERSION}.jar" 52 | #end::build_gluten[] 53 | fi 54 | 55 | export GLUTEN_JAR_PATH 56 | 57 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Simple tests for our SimpleNaiveBayes demo pipeline stage 3 | */ 4 | package com.highperformancespark.examples.ml 5 | 6 | import org.apache.spark.ml._ 7 | import org.apache.spark.ml.feature._ 8 | import org.apache.spark.ml.param._ 9 | import org.apache.spark.sql.DataFrame 10 | import org.apache.spark.sql.Dataset 11 | import org.apache.spark.sql.Row 12 | import org.apache.spark.sql.SQLContext 13 | import org.apache.spark.sql.types._ 14 | 15 | import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo 16 | import com.highperformancespark.examples.dataframe.HappyPandas.Pandas 17 | import com.holdenkarau.spark.testing._ 18 | import org.scalatest.funsuite.AnyFunSuite 19 | import org.scalatest.matchers.should.Matchers._ 20 | 21 | case class MiniPanda(happy: Double, fuzzy: Double, old: Double) 22 | 23 | class SimpleNaiveBayesSuite extends AnyFunSuite with DataFrameSuiteBase { 24 | val miniPandasList = List( 25 | MiniPanda(1.0, 1.0, 1.0), 26 | MiniPanda(1.0, 1.0, 0.0), 27 | MiniPanda(1.0, 1.0, 0.0), 28 | MiniPanda(0.0, 0.0, 1.0), 29 | MiniPanda(0.0, 0.0, 0.0)) 30 | 31 | test("simple sanity test") { 32 | val session = spark 33 | import session.implicits._ 34 | val ds: Dataset[MiniPanda] = session.createDataset(miniPandasList) 35 | val assembler = new VectorAssembler() 36 | assembler.setInputCols(Array("fuzzy", "old")) 37 | assembler.setOutputCol("magical_features") 38 | val snb = new SimpleNaiveBayes() 39 | snb.setLabelCol("happy") 40 | snb.setFeaturesCol("magical_features") 41 | val pipeline = new Pipeline().setStages(Array(assembler, snb)) 42 | val model = pipeline.fit(ds) 43 | val test = ds.select("fuzzy", "old") 44 | val predicted = model.transform(test) 45 | assert(predicted.count() === miniPandasList.size) 46 | val nbModel = model.stages(1).asInstanceOf[SimpleNaiveBayesModel] 47 | assert(nbModel.getFeaturesCol === "magical_features") 48 | assert(nbModel.copy(ParamMap.empty).getFeaturesCol === "magical_features") 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/errors/throws.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.errors 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.rdd.RDD 5 | 6 | object Throws { 7 | def throwInner(sc: SparkContext) = { 8 | //tag::throwInner1[] 9 | val data = sc.parallelize(List(1, 2, 3)) 10 | // Will throw an exception when forced to evaluate 11 | val transform1 = data.map(x => x/0) 12 | val transform2 = transform1.map(x => x + 1) 13 | transform2.collect() // Forces evaluation 14 | //end::throwInner1[] 15 | } 16 | 17 | def throwOuter(sc: SparkContext) = { 18 | //tag::throwOuter1[] 19 | val data = sc.parallelize(List(1, 2, 3)) 20 | val transform1 = data.map(x => x + 1) 21 | // Will throw 
an exception when forced to evaluate 22 | val transform2 = transform1.map(x => x/0) 23 | transform2.collect() // Forces evaluation 24 | //end::throwOuter1[] 25 | } 26 | 27 | //tag::badFunctions[] 28 | def add1(x: Int): Int = { 29 | x + 1 30 | } 31 | 32 | def divZero(x: Int): Int = { 33 | x / 0 34 | } 35 | //end::badFunctions[] 36 | 37 | //tag::badEx3[] 38 | def throwInner2(sc: SparkContext) = { 39 | val data = sc.parallelize(List(1, 2, 3)) 40 | // Will throw an exception when forced to evaluate 41 | val transform1 = data.map(divZero) 42 | val transform2 = transform1.map(add1) 43 | transform2.collect() // Forces evaluation 44 | } 45 | 46 | def throwOuter2(sc: SparkContext) = { 47 | val data = sc.parallelize(List(1, 2, 3)) 48 | val transform1 = data.map(add1) 49 | // Will throw an exception when forced to evaluate 50 | val transform2 = transform1.map(divZero) 51 | transform2.collect() // Forces evaluation 52 | } 53 | //end::badEx3 54 | 55 | def nonExistentInput(sc: SparkContext) = { 56 | //tag::nonExistentInput[] 57 | val input = sc.textFile("file:///doesnotexist.txt") 58 | val data = input.map(x => x.toInt) 59 | val transform = data.map(x => x + 1) 60 | transform.collect() // Forces evaluation 61 | //end::nonExistentInput[] 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala: -------------------------------------------------------------------------------- 1 | import scala.reflect.ClassTag 2 | import scala.util.Random 3 | 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | 7 | /** 8 | * Sample our production data to be able to use it for tests 9 | */ 10 | object SampleData { 11 | /** 12 | * Sample the input down to k % for usage in tests 13 | */ 14 | def sampleInput[T](rdd: RDD[T]): RDD[T] = { 15 | // tag::randomSampleInput[] 16 | rdd.sample(withReplacement=false, fraction=0.1) 17 | // end::randomSampleInput[] 18 | } 19 | 20 | /** 21 | * Construct a stratified sample 22 | */ 23 | def stratifiedSample(rdd: RDD[(String, Array[Double])]): 24 | RDD[(String, Array[Double])] = { 25 | // tag::stratifiedSample[] 26 | // 5% of the red pandas, and 50% of the giant pandas 27 | val stratas = Map("red" -> 0.05, "giant" -> 0.50) 28 | rdd.sampleByKey(withReplacement=false, fractions = stratas) 29 | // end::stratifiedSample[] 30 | } 31 | 32 | /** 33 | * Custom random sample with RNG. This is intended as an example of how 34 | * to save setup overhead. 35 | */ 36 | def slowSampleInput[T: ClassTag](rdd: RDD[T]): RDD[T] = { 37 | rdd.flatMap{x => val r = new Random() 38 | if (r.nextInt(10) == 0) { 39 | Some(x) 40 | } else { 41 | None 42 | }} 43 | } 44 | 45 | /** 46 | * Custom random sample with RNG. This is intended as an example of how to 47 | * save setup overhead. 
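   * A rough cost picture (illustrative, not a measured benchmark): the
   * `slowSampleInput` above seeds a new `Random` per element, while the
   * `mapPartitions` version below seeds one per partition, e.g., given a
   * `SparkContext` named `sc`:
   * {{{
   *   // one RNG per partition instead of one per row
   *   val sampled = customSampleInput(sc.parallelize(1 to 1000000, 8))
   * }}}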
48 | */ 49 | def customSampleInput[T: ClassTag](rdd: RDD[T]): RDD[T] = { 50 | // tag::mapPartitions[] 51 | rdd.mapPartitions{itr => 52 | // Only create once RNG per partitions 53 | val r = new Random() 54 | itr.filter(x => r.nextInt(10) == 0) 55 | } 56 | // end::mapPartitions[] 57 | } 58 | 59 | // tag::broadcast[] 60 | class LazyPrng { 61 | @transient lazy val r = new Random() 62 | } 63 | def customSampleBroadcast[T: ClassTag](sc: SparkContext, rdd: RDD[T]): RDD[T]= { 64 | val bcastprng = sc.broadcast(new LazyPrng()) 65 | rdd.filter(x => bcastprng.value.r.nextInt(10) == 0) 66 | } 67 | // end::broadcast[] 68 | } 69 | -------------------------------------------------------------------------------- /conf/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console 19 | log4j.rootCategory=ERROR, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 24 | 25 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 26 | # log level for this class is used to overwrite the root logger's log level, so that 27 | # the user can have different defaults for the shell and regular Spark apps. 
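# An illustrative override (not part of the stock config): to watch scheduler
# activity while keeping everything else at ERROR, a per-logger line in the
# same style as those below can be added, e.g.
# log4j.logger.org.apache.spark.scheduler.DAGScheduler=INFO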
28 | log4j.logger.org.apache.spark.repl.Main=ERROR 29 | 30 | # Settings to quiet third party logs that are too verbose 31 | log4j.logger.org.spark-project.jetty=ERROR 32 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR 33 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 34 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 35 | log4j.logger.org.apache.parquet=ERROR 36 | log4j.logger.parquet=ERROR 37 | 38 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 39 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 40 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 41 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.wordcount 2 | 3 | 4 | import com.holdenkarau.spark.testing.SharedSparkContext 5 | import org.scalatest.funsuite.AnyFunSuite 6 | 7 | class WordCountTest extends AnyFunSuite with SharedSparkContext { 8 | test("word count with Stop Words Removed"){ 9 | val wordRDD = sc.parallelize(Seq( 10 | "How happy was the panda? You ask.", 11 | "Panda is the most happy panda in all the #$!?ing land!")) 12 | 13 | val stopWords: Set[String] = Set("a", "the", "in", "was", "there", "she", "he") 14 | val illegalTokens: Array[Char] = "#$%?!.".toCharArray 15 | 16 | val wordCounts = WordCount.withStopWordsFiltered( 17 | wordRDD, illegalTokens, stopWords) 18 | val wordCountsAsMap = wordCounts.collectAsMap() 19 | assert(!wordCountsAsMap.contains("the")) 20 | assert(!wordCountsAsMap.contains("?")) 21 | assert(!wordCountsAsMap.contains("#$!?ing")) 22 | assert(wordCountsAsMap.contains("ing")) 23 | assert(wordCountsAsMap.get("panda").get.equals(3)) 24 | } 25 | 26 | test("word count with simple counting") { 27 | val wordRDD = sc.parallelize( 28 | Seq( 29 | "a b c d", 30 | "b c d e" 31 | ) 32 | ) 33 | val wordCounts = WordCount.simpleWordCount(wordRDD) 34 | 35 | val wordCountsAsMap = wordCounts.collectAsMap() 36 | 37 | for (character <- 'a' to 'e') { 38 | assert(wordCountsAsMap.contains(character.toString)) 39 | } 40 | for (character <- 'b' to 'd') { 41 | assert(wordCountsAsMap.get(character.toString).get == 2) 42 | } 43 | } 44 | 45 | test("word count with bad idea") { 46 | val wordRDD = sc.parallelize( 47 | Seq( 48 | "a b c d", 49 | "b c d e" 50 | ) 51 | ) 52 | val wordCounts = WordCount.badIdea(wordRDD) 53 | 54 | val wordCountsAsMap = wordCounts.collectAsMap() 55 | 56 | for (character <- 'a' to 'e') { 57 | assert(wordCountsAsMap.contains(character.toString)) 58 | } 59 | for (character <- 'b' to 'd') { 60 | assert(wordCountsAsMap.get(character.toString).get == 2) 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.tools 2 | 3 | import scala.collection.immutable.HashSet 4 | 5 | import org.apache.spark._ 6 | import org.apache.spark.rdd.RDD 7 | 8 | import com.highperformancespark.examples.dataframe.RawPanda 9 | //tag::loggerImport[] 10 | import org.apache.logging.log4j.LogManager 11 | //end::loggerImport[] 12 | 13 | object 
FilterInvalidPandas { 14 | 15 | def filterInvalidPandas(sc: SparkContext, invalidPandas: List[Long], 16 | input: RDD[RawPanda]) = { 17 | //tag::broadcast[] 18 | val invalid: HashSet[Long] = HashSet() ++ invalidPandas 19 | val invalidBroadcast = sc.broadcast(invalid) 20 | input.filter{panda => !invalidBroadcast.value.contains(panda.id)} 21 | //end::broadcast[] 22 | } 23 | 24 | def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long], 25 | input: RDD[RawPanda]) = { 26 | //tag::broadcastAndLog[] 27 | val invalid: HashSet[Long] = HashSet() ++ invalidPandas 28 | val invalidBroadcast = sc.broadcast(invalid) 29 | def keepPanda(pandaId: Long) = { 30 | val logger = LogManager.getLogger("fart based logs") 31 | if (invalidBroadcast.value.contains(pandaId)) { 32 | logger.debug("hi") 33 | false 34 | } else { 35 | true 36 | } 37 | } 38 | input.filter{panda => keepPanda(panda.id)} 39 | //end::broadcastAndLog[] 40 | } 41 | } 42 | 43 | //tag::broadcastAndLogClass[] 44 | class AltLog() { 45 | lazy val logger = LogManager.getLogger("fart based logs") 46 | def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long], 47 | input: RDD[RawPanda]) = { 48 | val invalid: HashSet[Long] = HashSet() ++ invalidPandas 49 | val invalidBroadcast = sc.broadcast(invalid) 50 | def keepPanda(pandaId: Long) = { 51 | val logger = LogManager.getLogger("fart based logs") 52 | if (invalidBroadcast.value.contains(pandaId)) { 53 | logger.debug("hi") 54 | false 55 | } else { 56 | true 57 | } 58 | } 59 | input.filter{panda => keepPanda(panda.id)} 60 | } 61 | } 62 | //end::broadcastAndLogClass[] 63 | -------------------------------------------------------------------------------- /core/src/main/perl/README: -------------------------------------------------------------------------------- 1 | HighPerformanceSpark-Examples 2 | 3 | The README is used to introduce the module and provide instructions on 4 | how to install the module, any machine dependencies it may have (for 5 | example C compilers and installed libraries) and any other information 6 | that should be provided before the module is installed. 7 | 8 | A README file is required for CPAN modules since CPAN extracts the README 9 | file from a module distribution so that people browsing the archive 10 | can use it to get an idea of the module's uses. It is usually a good idea 11 | to provide version information here so that people can decide whether 12 | fixes for the module are worth downloading. 13 | 14 | 15 | INSTALLATION 16 | 17 | To install this module, run the following commands: 18 | 19 | perl Makefile.PL 20 | make 21 | make test 22 | make install 23 | 24 | SUPPORT AND DOCUMENTATION 25 | 26 | After installing, you can find documentation for this module with the 27 | perldoc command. 28 | 29 | perldoc HighPerformanceSpark::Examples 30 | 31 | You can also look for information at: 32 | 33 | RT, CPAN's request tracker (report bugs here) 34 | http://rt.cpan.org/NoAuth/Bugs.html?Dist=HighPerformanceSpark-Examples 35 | 36 | AnnoCPAN, Annotated CPAN documentation 37 | http://annocpan.org/dist/HighPerformanceSpark-Examples 38 | 39 | CPAN Ratings 40 | http://cpanratings.perl.org/d/HighPerformanceSpark-Examples 41 | 42 | Search CPAN 43 | http://search.cpan.org/dist/HighPerformanceSpark-Examples/ 44 | 45 | 46 | LICENSE AND COPYRIGHT 47 | 48 | Copyright (C) 2016 Holden Karau And Rachel Warren 49 | 50 | Licensed under the Apache License, Version 2.0 (the "License"); 51 | you may not use this file except in compliance with the License. 
52 | You may obtain a copy of the License at 53 | 54 | L 55 | 56 | Unless required by applicable law or agreed to in writing, software 57 | distributed under the License is distributed on an "AS IS" BASIS, 58 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 59 | See the License for the specific language governing permissions and 60 | limitations under the License. 61 | 62 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back: -------------------------------------------------------------------------------- 1 | /** 2 | * A sample mixing relational & functional transformations with Datasets. 3 | */ 4 | package com.highperformancespark.examples.dataframe 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.catalyst.expressions.aggregate._ 10 | import org.apache.spark.sql.expressions._ 11 | import org.apache.spark.sql.functions._ 12 | import org.apache.spark.sql.types._ 13 | // Additional imports for using HiveContext 14 | import org.apache.spark.sql.hive._ 15 | import org.apache.spark.sql.hive.thriftserver._ 16 | 17 | class MixedDataset(sqlCtx: SQLContext) { 18 | import sqlCtx.implicits._ 19 | 20 | /** 21 | * A sample function on a Dataset of RawPandas. 22 | * This is contrived, since our reduction could also be done with SQL aggregates, but 23 | * we can see the flexibility of being able to specify arbitrary Scala code. 24 | */ 25 | def happyPandaSums(ds: Dataset[RawPanda]): Double = { 26 | ds.toDF().filter($"happy" === true).as[RawPanda]. 27 | select($"attributes"(0).as[Double]). 28 | reduce((x, y) => x + y) 29 | } 30 | 31 | /** 32 | * Functional map + Dataset, sums the positive attributes for the pandas 33 | */ 34 | def funMap(ds: Dataset[RawPanda]): Dataset[Double] = { 35 | ds.map{rp => rp.attributes.filter(_ > 0).sum} 36 | } 37 | 38 | /** 39 | * Illustrate how we make typed queries, using some of the float properties to produce boolean 40 | * values. 41 | */ 42 | def typedQueryExample(ds: Dataset[RawPanda]): Dataset[Double] = { 43 | ds.select($"attributes"(0).as[Double]) 44 | } 45 | 46 | /** 47 | * Illustrate converting a Dataset to an RDD 48 | */ 49 | def toRDD(ds: Dataset[RawPanda]): RDD[RawPanda] = { 50 | ds.rdd 51 | } 52 | 53 | /** 54 | * Illustrate converting a Dataset to a DataFrame 55 | */ 56 | def toDF(ds: Dataset[RawPanda]): DataFrame = { 57 | ds.toDF() 58 | } 59 | 60 | /** 61 | * Illustrate DataFrame to Dataset. Its important to note that if the schema does not match what 62 | * is expected by the Dataset this fails fast. 
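   * For example (illustrative): if `df` is missing the `zip` column, or a
   * column cannot be safely up-cast to the type declared in `RawPanda`, then
   * `df.as[RawPanda]` throws an `AnalysisException` as soon as the Dataset is
   * declared, rather than when an action is eventually run.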
63 | */ 64 | def fromDF(df: DataFrame): Dataset[RawPanda] = { 65 | df.as[RawPanda] 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /run_sql_examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | set -o pipefail 4 | 5 | source env_setup.sh 6 | 7 | # You might want to set SPARK_EXTRA to do things like log more info 8 | 9 | function run_example () { 10 | local sql_file="$1" 11 | local extra="$2" 12 | EXTENSIONS=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions 13 | if [ -n "$EXTRA_EXTENSIONS" ]; then 14 | EXTENSIONS="$EXTENSIONS,$EXTRA_EXTENSIONS" 15 | fi 16 | # shellcheck disable=SC2046,SC2086 17 | ${SPARK_HOME}/bin/spark-sql --master local[5] \ 18 | --conf spark.eventLog.enabled=true \ 19 | --conf spark.sql.extensions=$EXTENSIONS \ 20 | --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ 21 | --conf spark.sql.catalog.spark_catalog.type=hive \ 22 | --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ 23 | --conf spark.sql.catalog.local.type=hadoop \ 24 | --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ 25 | ${extra} ${SPARK_EXTRA} \ 26 | $(cat "${sql_file}.conf" || echo "") \ 27 | --name "${sql_file}" \ 28 | -f "${sql_file}" 2>&1 | tee -a "${sql_file}.out" || ls "${sql_file}.expected_to_fail" 29 | } 30 | 31 | 32 | # If you want to look at them 33 | # ${SPARK_PATH}/sbin/start-history-server.sh 34 | 35 | if [ $# -eq 1 ]; then 36 | if [[ "$1" != *"gluten_only"* ]]; then 37 | run_example "sql/$1" 38 | else 39 | echo "Processing gluten ${sql_file}" 40 | # shellcheck disable=SC2046 41 | run_example "$sql_file" 42 | fi 43 | else 44 | # For each SQL 45 | for sql_file in sql/*.sql; do 46 | if [[ "$sql_file" != *"_only"* ]]; then 47 | echo "Processing ${sql_file}" 48 | # shellcheck disable=SC2046 49 | run_example "$sql_file" 50 | elif [[ "$sql_file" != *"gluten_only"* && "$GLUTEN_EXISTS" == "true" ]]; then 51 | echo "Processing gluten ${sql_file}" 52 | # shellcheck disable=SC2046 53 | run_example "$sql_file" 54 | elif [[ "$sql_file" != *"gluten_udf_only"* && "$GLUTEN_UDF_EXISTS" == "true" ]]; then 55 | echo "Processing gluten UDF ${sql_file}" 56 | # shellcheck disable=SC2046 57 | run_example "$sql_file" 58 | else 59 | echo "Skipping $sql_file since we did not find gluten and this is restricted example." 60 | fi 61 | done 62 | fi 63 | -------------------------------------------------------------------------------- /native/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | # A minimal CMake file that is compatible with sbt-jni # 3 | # # 4 | # All settings required by sbt-jni have been marked so, please # 5 | # add/modify/remove settings to build your specific library. 
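# A typical manual invocation, for reference (illustrative; sbt-jni normally
# configures and drives CMake itself when building through sbt):
#   mkdir -p native/target/manual-build && cd native/target/manual-build
#   cmake ../../src -DSBT=OFF && make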
# 6 | ################################################################ 7 | 8 | cmake_minimum_required(VERSION 3.12) 9 | 10 | option(SBT "Set if invoked from sbt-jni" OFF) 11 | 12 | # Define project and related variables 13 | # (required by sbt-jni) please use semantic versioning 14 | # 15 | project (high-performance-spark) 16 | enable_language(Fortran) 17 | set(PROJECT_VERSION_MAJOR 0) 18 | set(PROJECT_VERSION_MINOR 0) 19 | set(PROJECT_VERSION_PATCH 0) 20 | 21 | set (LIB_NAME ${PROJECT_NAME}${PROJECT_VERSION_MAJOR}) 22 | 23 | #tag::velox[] 24 | set (GLUTEN_LIB_NAME ${PROJECT_NAME}-gluten-${PROJECT_VERSION_MAJOR}) 25 | # For gluten+velox, you can leave out if not using gluten 26 | set(GLUTEN_HOME ../../gluten) 27 | set(CMAKE_FIND_DEBUG_MODE TRUE) 28 | find_library(VELOX_LIBRARY NAMES velox HINTS 29 | ${GLUTEN_HOME}/cpp/build/releases NO_DEFAULT_PATH) 30 | # End gluten specific 31 | 32 | if(VELOX_LIBRARY) 33 | file(GLOB GLUTEN_UDF_FILES 34 | "./c/gluten/*.cpp") 35 | add_library(${GLUTEN_LIB_NAME} SHARED ${GLUTEN_UDF_FILES}) 36 | target_include_directories(${GLUTEN_LIB_NAME} PRIVATE ${GLUTEN_HOME}/cpp ${GLUTEN_HOME}/ep/build-velox/build/velox_ep) 37 | target_link_libraries(${GLUTEN_LIB_NAME} PRIVATE ${VELOX_LIBRARY}) 38 | else() 39 | message(WARNING "Velox library not found. Specific path not added.") 40 | endif() 41 | #end::velox[] 42 | 43 | # Setup JNI 44 | find_package(JNI REQUIRED) 45 | if (JNI_FOUND) 46 | message (STATUS "JNI include directories: ${JNI_INCLUDE_DIRS}") 47 | endif() 48 | 49 | # Include directories 50 | include_directories(.) 51 | include_directories(include) 52 | include_directories(${JNI_INCLUDE_DIRS}) 53 | 54 | # Sources 55 | file(GLOB LIB_SRC 56 | "*.c" 57 | "*.f95" 58 | "*.f*" 59 | "*.cc" 60 | "*.cpp" 61 | "./c/*.c" 62 | "./c/*.cpp" 63 | "./fortran/*.f95" 64 | "./fortran/*.f*" 65 | ) 66 | 67 | # Setup installation targets 68 | # (required by sbt-jni) major version should always be appended to library name 69 | # 70 | add_library(${LIB_NAME} SHARED ${LIB_SRC}) 71 | install(TARGETS ${LIB_NAME} LIBRARY DESTINATION .) 
72 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.dataframe; 2 | 3 | import org.apache.spark.sql.Row; 4 | import org.apache.spark.sql.SQLContext; 5 | import org.apache.spark.sql.expressions.MutableAggregationBuffer; 6 | import org.apache.spark.sql.expressions.UserDefinedAggregateFunction; 7 | import org.apache.spark.sql.types.*; 8 | 9 | public class JavaUDFs { 10 | 11 | public static void setupUDFs(SQLContext sqlContext) { 12 | //tag::basicUDF[] 13 | sqlContext.udf() 14 | .register("strlen", 15 | (String s) -> s.length(), DataTypes.StringType); 16 | //end::basicUDF[] 17 | } 18 | 19 | public static void setupUDAFs(SQLContext sqlContext) { 20 | 21 | class Avg extends UserDefinedAggregateFunction { 22 | 23 | @Override 24 | public StructType inputSchema() { 25 | StructType inputSchema = 26 | new StructType(new StructField[]{new StructField("value", DataTypes.DoubleType, true, Metadata.empty())}); 27 | return inputSchema; 28 | } 29 | 30 | @Override 31 | public StructType bufferSchema() { 32 | StructType bufferSchema = 33 | new StructType(new StructField[]{ 34 | new StructField("count", DataTypes.LongType, true, Metadata.empty()), 35 | new StructField("sum", DataTypes.DoubleType, true, Metadata.empty()) 36 | }); 37 | 38 | return bufferSchema; 39 | } 40 | 41 | @Override 42 | public DataType dataType() { 43 | return DataTypes.DoubleType; 44 | } 45 | 46 | @Override 47 | public boolean deterministic() { 48 | return true; 49 | } 50 | 51 | @Override 52 | public void initialize(MutableAggregationBuffer buffer) { 53 | buffer.update(0, 0L); 54 | buffer.update(1, 0.0); 55 | } 56 | 57 | @Override 58 | public void update(MutableAggregationBuffer buffer, Row input) { 59 | buffer.update(0, buffer.getLong(0) + 1); 60 | buffer.update(1, buffer.getDouble(1) + input.getDouble(0)); 61 | } 62 | 63 | @Override 64 | public void merge(MutableAggregationBuffer buffer1, Row buffer2) { 65 | buffer1.update(0, buffer1.getLong(0) + buffer2.getLong(0)); 66 | buffer1.update(1, buffer1.getDouble(1) + buffer2.getDouble(1)); 67 | } 68 | 69 | @Override 70 | public Object evaluate(Row buffer) { 71 | return buffer.getDouble(1) / buffer.getLong(0); 72 | } 73 | } 74 | 75 | Avg average = new Avg(); 76 | sqlContext.udf().register("ourAvg", average); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /env_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | # Download Spark and iceberg if not present 6 | SPARK_MAJOR=${SPARK_MAJOR:-"3.5"} 7 | SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.3"} 8 | SCALA_VERSION=${SCALA_VERSION:-"2.13"} 9 | HADOOP_VERSION="3" 10 | SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" 11 | SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" 12 | if [ "$SCALA_VERSION" = "2.13" ]; then 13 | SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3-scala2.13.tgz" 14 | SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala2.13" 15 | fi 16 | ICEBERG_VERSION=${ICEBERG_VERSION:-"1.9.2"} 17 | if [ ! 
-f "${SPARK_FILE}" ]; then 18 | SPARK_DIST_URL="https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" 19 | SPARK_ARCHIVE_DIST_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" 20 | if command -v axel &> /dev/null 21 | then 22 | (axel --quiet "$SPARK_DIST_URL" || axel --quiet "$SPARK_ARCHIVE_DIST_URL") & 23 | else 24 | (wget --quiet "$SPARK_DIST_URL" || wget --quiet "$SPARK_ARCHIVE_DIST_URL") & 25 | fi 26 | fi 27 | # Download Icberg if not present 28 | ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}-${ICEBERG_VERSION}.jar" 29 | if [ ! -f "${ICEBERG_FILE}" ]; then 30 | wget --quiet "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & 31 | fi 32 | wait 33 | sleep 1 34 | # Setup the env 35 | if [ ! -d "${SPARK_PATH}" ]; then 36 | tar -xf "${SPARK_FILE}" 37 | fi 38 | 39 | SPARK_HOME="${SPARK_PATH}" 40 | export SPARK_HOME 41 | 42 | if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then 43 | # Delete the old JAR first. 44 | rm "${SPARK_PATH}/jars/iceberg-spark-runtime*.jar" || echo "No old version to delete." 45 | cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}" 46 | fi 47 | 48 | # Set up for running pyspark and friends 49 | export PATH="${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH}" 50 | 51 | # Make sure we have a history directory 52 | mkdir -p /tmp/spark-events 53 | 54 | mkdir -p ./data/fetched/ 55 | if [ ! -f ./data/fetched/2021 ]; then 56 | wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" -O ./data/fetched/2021 57 | fi 58 | if [ ! -f ./data/fetched/2022 ]; then 59 | wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2022" -O ./data/fetched/2022 60 | fi 61 | if [ ! -f ./data/fetched/2023 ]; then 62 | wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2023" -O ./data/fetched/2023 63 | fi 64 | 65 | -------------------------------------------------------------------------------- /run_pyspark_examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC1091,SC2034 3 | 4 | source env_setup.sh 5 | 6 | set -ex 7 | 8 | set -o pipefail 9 | 10 | #tag::package_venv[] 11 | if [ ! -d pyspark_venv ]; then 12 | python -m venv pyspark_venv 13 | fi 14 | 15 | source pyspark_venv/bin/activate 16 | pip install -r ./python/requirements.txt 17 | 18 | if [ ! -f pyspark_venv.tar.gz ]; then 19 | venv-pack -o pyspark_venv.tar.gz 20 | fi 21 | 22 | 23 | # Set in local and client mode where the driver uses the Python present 24 | # (requires that you have activated the venv as we did above) 25 | PYSPARK_DRIVER_PYTHON=python 26 | export PYSPARK_DRIVER_PYTHON 27 | export PYTHON_PATH=./environment/bin/python 28 | #end::package_venv[] 29 | 30 | # Some hack for our json magic 31 | cat se*.json > spark_expectations_sample_rules.json 32 | 33 | function check_fail () { 34 | local ex="$1" 35 | local code="$2" 36 | if [ -f "${ex}.fail" ]; then 37 | echo "ok"; 38 | else 39 | exit "$code" 40 | fi 41 | } 42 | 43 | EXAMPLE_JAR="./core/target/scala-2.13/core-assembly-0.1.0-SNAPSHOT.jar" 44 | 45 | pip install setuptools 46 | 47 | # Iceberg JAR not yet available for Spark 4. 48 | if [ ! -f "${EXAMPLE_JAR}" ]; then 49 | rm ./core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala # temp hack no merge in Spark 3. 
50 | sbt core/assembly -DsparkVersion="${SPARK_VERSION}" 51 | fi 52 | 53 | if [ ! -f "${EXAMPLE_JAR}" ]; then 54 | echo "Can't find sample jar?!?" 55 | exit 1 56 | fi 57 | 58 | function run_example () { 59 | local ex="$1" 60 | # shellcheck disable=SC2046 61 | spark-submit \ 62 | --master local[5] \ 63 | --conf spark.eventLog.enabled=true \ 64 | --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ 65 | --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ 66 | --conf spark.sql.catalog.spark_catalog.type=hive \ 67 | --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ 68 | --conf spark.sql.catalog.local.type=hadoop \ 69 | --archives pyspark_venv.tar.gz#environment \ 70 | --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ 71 | $(cat "${ex}.conf" || echo "") \ 72 | --name "${ex}" \ 73 | --jars "${EXAMPLE_JAR}" \ 74 | "${ex}" 2>&1 | tee -a "${ex}.out" || check_fail "$ex" $? 75 | } 76 | 77 | if [ $# -eq 1 ]; then 78 | run_example "python/examples/$1" 79 | else 80 | for ex in python/examples/*.py; do 81 | if [[ "$ex" =~ test.* ]]; then 82 | echo "Skipping ex $ex as it is a test and covered by our tests." 83 | else 84 | echo "Running $ex" 85 | run_example "$ex" 86 | fi 87 | done 88 | fi 89 | -------------------------------------------------------------------------------- /native/src/c/gluten/GlutenUDF.cpp: -------------------------------------------------------------------------------- 1 | // Filename MyUDF.cpp 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | namespace { 9 | using namespace facebook::velox; 10 | 11 | template 12 | class PlusConstantFunction : public exec::VectorFunction { 13 | public: 14 | explicit PlusConstantFunction(int32_t addition) : addition_(addition) {} 15 | 16 | void apply( 17 | const SelectivityVector& rows, 18 | std::vector& args, 19 | const TypePtr& /* outputType */, 20 | exec::EvalCtx& context, 21 | VectorPtr& result) const override { 22 | using nativeType = typename TypeTraits::NativeType; 23 | VELOX_CHECK_EQ(args.size(), 1); 24 | 25 | auto& arg = args[0]; 26 | 27 | // The argument may be flat or constant. 
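    // (Illustrative gloss: a "flat" vector stores one value per row in a
    // contiguous buffer, while a "constant" vector holds a single value shared
    // by every selected row -- hence the two read paths in the branches below.)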
28 | VELOX_CHECK(arg->isFlatEncoding() || arg->isConstantEncoding()); 29 | 30 | BaseVector::ensureWritable(rows, createScalarType<kind>(), context.pool(), result); 31 | 32 | auto* flatResult = result->asFlatVector<nativeType>(); 33 | auto* rawResult = flatResult->mutableRawValues(); 34 | 35 | flatResult->clearNulls(rows); 36 | 37 | if (arg->isConstantEncoding()) { 38 | auto value = arg->as<ConstantVector<nativeType>>()->valueAt(0); 39 | rows.applyToSelected([&](auto row) { rawResult[row] = value + addition_; }); 40 | } else { 41 | auto* rawInput = arg->as<FlatVector<nativeType>>()->rawValues(); 42 | 43 | rows.applyToSelected([&](auto row) { rawResult[row] = rawInput[row] + addition_; }); 44 | } 45 | } 46 | 47 | private: 48 | const int32_t addition_; 49 | }; 50 | 51 | static std::vector<std::shared_ptr<exec::FunctionSignature>> integerSignatures() { 52 | // integer -> integer 53 | return {exec::FunctionSignatureBuilder().returnType("integer").argumentType("integer").build()}; 54 | } 55 | 56 | static std::vector<std::shared_ptr<exec::FunctionSignature>> bigintSignatures() { 57 | // bigint -> bigint 58 | return {exec::FunctionSignatureBuilder().returnType("bigint").argumentType("bigint").build()}; 59 | } 60 | 61 | } // namespace 62 | 63 | const int kNumMyUdf = 2; 64 | gluten::UdfEntry myUdf[kNumMyUdf] = {{"myudf1", "integer"}, {"myudf2", "bigint"}}; 65 | 66 | DEFINE_GET_NUM_UDF { 67 | return kNumMyUdf; 68 | } 69 | 70 | DEFINE_GET_UDF_ENTRIES { 71 | for (auto i = 0; i < kNumMyUdf; ++i) { 72 | udfEntries[i] = myUdf[i]; 73 | } 74 | } 75 | 76 | DEFINE_REGISTER_UDF { 77 | facebook::velox::exec::registerVectorFunction( 78 | "myudf1", integerSignatures(), std::make_unique<PlusConstantFunction<facebook::velox::TypeKind::INTEGER>>(5)); 79 | facebook::velox::exec::registerVectorFunction( 80 | "myudf2", bigintSignatures(), std::make_unique<PlusConstantFunction<facebook::velox::TypeKind::BIGINT>>(5)); 81 | std::cout << "registered myudf1, myudf2" << std::endl; 82 | } 83 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Streaming Pandas Example with the old DStream APIs.
3 | */ 4 | package com.highperformancespark.examples.streaming 5 | 6 | import scala.reflect.ClassTag 7 | 8 | import org.apache.spark._ 9 | import org.apache.spark.rdd.RDD 10 | import org.apache.spark.streaming._ 11 | import org.apache.spark.streaming.dstream._ 12 | 13 | import org.apache.hadoop.io.LongWritable 14 | import org.apache.hadoop.io.Text 15 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 16 | //end::DStreamImports[] 17 | 18 | object DStreamExamples { 19 | def makeStreamingContext(sc: SparkContext) = { 20 | //tag::ssc[] 21 | val batchInterval = Seconds(1) 22 | new StreamingContext(sc, batchInterval) 23 | //end::ssc[] 24 | } 25 | 26 | def makeRecoverableStreamingContext(sc: SparkContext, checkpointDir: String) = { 27 | //tag::sscRecover[] 28 | def createStreamingContext(): StreamingContext = { 29 | val batchInterval = Seconds(1) 30 | val ssc = new StreamingContext(sc, batchInterval) 31 | ssc.checkpoint(checkpointDir) 32 | // Then create whatever stream is required 33 | // And whatever mappings need to go on those streams 34 | ssc 35 | } 36 | val ssc = StreamingContext.getOrCreate(checkpointDir, 37 | createStreamingContext _) 38 | // Do whatever work needs to be done regardless of state 39 | // Start context and run 40 | ssc.start() 41 | //end::sscRecover[] 42 | } 43 | 44 | def fileAPIExample(ssc: StreamingContext, path: String): 45 | DStream[(Long, String)] = { 46 | //tag::file[] 47 | // You don't need to write the types of the InputDStream but it for illustration 48 | val inputDStream: InputDStream[(LongWritable, Text)] = 49 | ssc.fileStream[LongWritable, Text, TextInputFormat](path) 50 | // Convert the hadoop types to native JVM types for simplicity 51 | def convert(input: (LongWritable, Text)) = { 52 | (input._1.get(), input._2.toString()) 53 | } 54 | val input: DStream[(Long, String)] = inputDStream.map(convert) 55 | //end::file[] 56 | input 57 | } 58 | 59 | def repartition(dstream: DStream[_]) = { 60 | //tag::repartition[] 61 | dstream.repartition(20) 62 | //end::repartition[] 63 | } 64 | 65 | //tag::repartitionWithTransform[] 66 | def dStreamRepartition[A: ClassTag](dstream: DStream[A]): DStream[A] = { 67 | dstream.transform{rdd => rdd.repartition(20)} 68 | } 69 | //end::repartitionWithTransform[] 70 | 71 | def simpleTextOut(target: String, dstream: DStream[_]) = { 72 | //tag::simpleOut[] 73 | dstream.saveAsTextFiles(target) 74 | //end::simpleOut[] 75 | } 76 | 77 | def foreachSaveSequence(target: String, dstream: DStream[(Long, String)]) = { 78 | //tag::foreachSave[] 79 | dstream.foreachRDD{(rdd, window) => 80 | rdd.saveAsSequenceFile(target + window) 81 | } 82 | //end::foreachSave[] 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /core/src/main/perl/lib/HighPerformanceSpark/Examples.pm: -------------------------------------------------------------------------------- 1 | package HighPerformanceSpark::Examples; 2 | 3 | use 5.006; 4 | use strict; 5 | use warnings; 6 | 7 | =head1 NAME 8 | 9 | HighPerformanceSpark::Examples - The great new HighPerformanceSpark::Examples! 10 | 11 | =head1 VERSION 12 | 13 | Version 0.01 14 | 15 | =cut 16 | 17 | our $VERSION = '0.01'; 18 | 19 | 20 | =head1 SYNOPSIS 21 | 22 | Quick summary of what the module does. 23 | 24 | Perhaps a little code snippet. 25 | 26 | use HighPerformanceSpark::Examples; 27 | 28 | my $foo = HighPerformanceSpark::Examples->new(); 29 | ... 30 | 31 | =head1 EXPORT 32 | 33 | A list of functions that can be exported. 
You can delete this section 34 | if you don't export anything, such as for a purely object-oriented module. 35 | 36 | =head1 SUBROUTINES/METHODS 37 | 38 | =head2 function1 39 | 40 | =cut 41 | 42 | sub function1 { 43 | } 44 | 45 | =head2 function2 46 | 47 | =cut 48 | 49 | sub function2 { 50 | } 51 | 52 | =head1 AUTHOR 53 | 54 | Holden Karau And Rachel Warren, C<< >> 55 | 56 | =head1 BUGS 57 | 58 | Please report any bugs or feature requests to C, or through 59 | the web interface at L. I will be notified, and then you'll 60 | automatically be notified of progress on your bug as I make changes. 61 | 62 | 63 | 64 | 65 | =head1 SUPPORT 66 | 67 | You can find documentation for this module with the perldoc command. 68 | 69 | perldoc HighPerformanceSpark::Examples 70 | 71 | 72 | You can also look for information at: 73 | 74 | =over 4 75 | 76 | =item * RT: CPAN's request tracker (report bugs here) 77 | 78 | L 79 | 80 | =item * AnnoCPAN: Annotated CPAN documentation 81 | 82 | L 83 | 84 | =item * CPAN Ratings 85 | 86 | L 87 | 88 | =item * Search CPAN 89 | 90 | L 91 | 92 | =back 93 | 94 | 95 | =head1 ACKNOWLEDGEMENTS 96 | 97 | 98 | =head1 LICENSE AND COPYRIGHT 99 | 100 | Copyright 2016 Holden Karau And Rachel Warren. 101 | 102 | Licensed under the Apache License, Version 2.0 (the "License"); 103 | you may not use this file except in compliance with the License. 104 | You may obtain a copy of the License at 105 | 106 | L 107 | 108 | Unless required by applicable law or agreed to in writing, software 109 | distributed under the License is distributed on an "AS IS" BASIS, 110 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 111 | See the License for the specific language governing permissions and 112 | limitations under the License. 113 | 114 | 115 | =cut 116 | 117 | 1; # End of HighPerformanceSpark::Examples 118 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala: -------------------------------------------------------------------------------- 1 | 2 | package com.highperformancespark.examples.goldilocks 3 | 4 | import scala.reflect.ClassTag 5 | 6 | import org.apache.spark.rdd.RDD 7 | 8 | import com.holdenkarau.spark.testing.SharedSparkContext 9 | import org.scalatest.funsuite.AnyFunSuite 10 | 11 | 12 | class SortingTests extends AnyFunSuite with SharedSparkContext { 13 | 14 | test("Test Sort by two keys"){ 15 | 16 | val sortedData: Array[((Int, Char), Double)] = Range(0, 15).flatMap( x => 17 | Range(50, 100).map(i => (( x, i.toChar), Math.random())) 18 | ).toArray 19 | 20 | val unsorted = scramble(sc.parallelize(sortedData),2) 21 | val sortedSimple: Array[((Int, Char), Double)] = unsorted.sortByKey().collect() 22 | 23 | assert(sortedSimple sameElements sortedData) 24 | } 25 | 26 | test("Panda Secondary Sort"){ 27 | val pandaData: Array[(String, StreetAddress, Int, Double)] = Array( 28 | ("Morris", StreetAddress("Accra","Grove", 52 ), 84440, 0.0), 29 | ("Joe", StreetAddress("Accra","Grove", 52 ), 94440, 0.0), 30 | ("Kobe", StreetAddress("Accra","Grove", 52 ), 94440, 0.0), 31 | 32 | ("Morris", StreetAddress("Albany","Grove", 52 ), 84440, 0.0), 33 | ("Joe", StreetAddress("Albany","Grove", 52 ), 94440, 0.0), 34 | ("Kobe", StreetAddress("Albany","Grove", 52 ), 94440, 0.5), 35 | ("Morris", StreetAddress("Denver","Grove", 52 ), 84440, 0.5), 36 | ("Joe", StreetAddress("LA","Grove", 52 ), 94440, 0.5), 37 | ("Kobe", StreetAddress("LA","Grove", 52 ), 94440, 0.5), 38 | 
("Joe", StreetAddress("SanFransisco","Grove", 52 ), 94440, 0.5), 39 | ("Kobe", StreetAddress("SanFransisco","Grove", 52 ), 94440, 0.5), 40 | ("Joe", StreetAddress("Seattle","Grove", 52 ), 84440, 0.5), 41 | ("Kobe", StreetAddress("Seattle","Grove", 52 ), 84440, 0.5), 42 | ("Lacy", StreetAddress("Seattle","Grove", 52 ), 84440, 0.5), 43 | ("Morris", StreetAddress("Seattle","Grove", 52 ), 84440, 0.5), 44 | ("Joe", StreetAddress("Seattle","Grove", 52 ), 94440, 0.5), 45 | ("Kobe", StreetAddress("Seattle","Grove", 52 ), 94440, 0.5), 46 | ("Lacy", StreetAddress("Seattle","Grove", 52 ), 94440, 0.5), 47 | ("Morris", StreetAddress("Seattle","Grove", 52 ), 94440, 0.5), 48 | ("Joe", StreetAddress("Tacoma","Grove", 52 ), 94440, 0.5), 49 | ("Kobe", StreetAddress("Tacoma","Grove", 52 ), 94440, 0.5), 50 | ("Lacy", StreetAddress("Tacoma","Grove", 52 ), 94440, 0.5), 51 | ("Morris", StreetAddress("Tacoma","Grove", 52 ), 94440, 0.5) 52 | ) 53 | 54 | val unsorted = scramble(sc.parallelize(pandaData)) 55 | val pandaSort = PandaSecondarySort.secondarySort(unsorted) 56 | pandaSort.zipWithIndex().collect.foreach{ 57 | case (x, i) => assert(x == pandaData(i.toInt), "Element " + x + " is wrong") 58 | } 59 | 60 | 61 | 62 | } 63 | 64 | 65 | def scramble[T : ClassTag]( rdd : RDD[T], partitions : Int= 3) = { 66 | val wRandom = rdd.map((Math.random(), _)) 67 | val unsorted = wRandom.sortByKey(true, partitions) 68 | unsorted.values 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.tools 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.mllib.linalg.Vector 5 | import org.apache.spark.mllib.random.RandomRDDs 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.sql.Row 8 | 9 | import com.highperformancespark.examples.dataframe.RawPanda 10 | 11 | object GenerateScalingData { 12 | /** 13 | * Generate a Goldilocks data set. We expect the zip code to follow an exponential 14 | * distribution and the data its self to be normal 15 | * 16 | * Note: May generate less than number of requested rows due to different 17 | * distribution between 18 | * 19 | * partitions and zip being computed per partition. 20 | * @param rows number of rows in the RDD (approximate) 21 | * @param size number of value elements 22 | */ 23 | def generateFullGoldilocks(sc: SparkContext, rows: Long, numCols: Int): 24 | RDD[RawPanda] = { 25 | val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000, size = rows) 26 | .map(_.toInt.toString) 27 | val valuesRDD = RandomRDDs.normalVectorRDD( 28 | sc, numRows = rows, numCols = numCols) 29 | .repartition(zipRDD.partitions.size) 30 | val keyRDD = sc.parallelize(1L.to(rows), zipRDD.getNumPartitions) 31 | keyRDD.zipPartitions(zipRDD, valuesRDD){ 32 | (i1, i2, i3) => 33 | new Iterator[(Long, String, Vector)] { 34 | def hasNext: Boolean = (i1.hasNext, i2.hasNext, i3.hasNext) match { 35 | case (true, true, true) => true 36 | case (false, false, false) => false 37 | // Note: this is "unsafe" (we throw away data when one of 38 | // the partitions has run out). 
39 | case _ => false 40 | } 41 | def next(): (Long, String, Vector) = (i1.next(), i2.next(), i3.next()) 42 | } 43 | }.map{case (k, z, v) => 44 | RawPanda(k, z, "giant", v(0) > 0.5, v.toArray)} 45 | } 46 | 47 | /** 48 | * Transform it down to just the data used for the benchmark 49 | */ 50 | def generateMiniScale(sc: SparkContext, rows: Long, numCols: Int): 51 | RDD[(Int, Double)] = { 52 | generateFullGoldilocks(sc, rows, numCols) 53 | .map(p => (p.zip.toInt, p.attributes(0))) 54 | } 55 | 56 | /** 57 | * Transform it down to just the data used for the benchmark 58 | */ 59 | def generateMiniScaleRows(sc: SparkContext, rows: Long, numCols: Int): 60 | RDD[Row] = { 61 | generateMiniScale(sc, rows, numCols).map{case (zip, fuzzy) => Row(zip, fuzzy)} 62 | } 63 | 64 | // tag::MAGIC_PANDA[] 65 | /** 66 | * Generate a Goldilocks data set all with the same id. 67 | * We expect the zip code to follow an exponential 68 | * distribution and the data its self to be normal. 69 | * Simplified to avoid a 3-way zip. 70 | * 71 | * Note: May generate less than number of requested rows due to 72 | * different distribution between partitions and zip being computed 73 | * per partition. 74 | */ 75 | def generateGoldilocks(sc: SparkContext, rows: Long, numCols: Int): 76 | RDD[RawPanda] = { 77 | val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000, size = rows) 78 | .map(_.toInt.toString) 79 | val valuesRDD = RandomRDDs.normalVectorRDD( 80 | sc, numRows = rows, numCols = numCols) 81 | zipRDD.zip(valuesRDD).map{case (z, v) => 82 | RawPanda(1, z, "giant", v(0) > 0.5, v.toArray) 83 | } 84 | } 85 | // end::MAGIC_PANDA[] 86 | } 87 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.goldilocks 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import com.holdenkarau.spark.testing.SharedSparkContext 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | class EvaluationTests extends AnyFunSuite with SharedSparkContext { 9 | val doubleList = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0) 10 | val keyValuePairs = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0).zipWithIndex 11 | val path = "target/testResults" 12 | 13 | test("MapValues preserves Partitioning "){ 14 | val data: RDD[(Double, Int )] = sc.parallelize(keyValuePairs) 15 | // tag::MapValues[] 16 | val sortedData = data.sortByKey() 17 | val mapValues: RDD[(Double, String)] = sortedData.mapValues(_.toString) 18 | assert(mapValues.partitioner.isDefined, 19 | "Using Map Values preserves partitioning") 20 | 21 | val map = sortedData.map( pair => (pair._1, pair._2.toString)) 22 | assert(map.partitioner.isEmpty, "Using map does not preserve partitioning") 23 | // end::MapValues[] 24 | } 25 | 26 | test( "Subtract Behavior "){ 27 | // tag::Subtract[] 28 | val a = Array(1, 2, 3, 4, 4, 4, 4) 29 | val b = Array(3, 4) 30 | val rddA = sc.parallelize(a) 31 | val rddB = sc.parallelize(b) 32 | val rddC = rddA.subtract(rddB) 33 | assert(rddC.count() < rddA.count() - rddB.count()) 34 | // end::Subtract[] 35 | } 36 | 37 | test( "Intersection Behavior "){ 38 | // tag::Intersect[] 39 | val a = Array(1, 2, 3, 4, 4, 4, 4) 40 | val b = Array(3, 4) 41 | val rddA = sc.parallelize(a) 42 | val rddB = sc.parallelize(b) 43 | val intersection = rddA.intersection(rddB) 44 | val subtraction = rddA.subtract(rddB) 45 | val union = intersection.union(subtraction) 
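    // Why the assertion below holds: intersection() de-duplicates and
    // subtract() removes every copy of the shared elements, so the recombined
    // RDD is {1, 2, 3, 4} while rddA still contains three extra 4s.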
46 | assert(!rddA.collect().sorted.sameElements(union.collect().sorted)) 47 | // end::Intersect[] 48 | } 49 | 50 | test("Itereative Computations "){ 51 | def rmse(rdd : RDD[(Int, Int )]) = { 52 | val n = rdd.count() 53 | math.sqrt(rdd.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n) 54 | } 55 | 56 | val validationSet = sc.parallelize(keyValuePairs) 57 | 58 | // tag::iterativeComp[] 59 | val testSet: Array[RDD[(Double, Int)]] = 60 | Array( 61 | validationSet.mapValues(_ + 1), 62 | validationSet.mapValues(_ + 2), 63 | validationSet) 64 | validationSet.persist() //persist since we are using this RDD several times 65 | val errors = testSet.map( rdd => { 66 | rmse(rdd.join(validationSet).values) 67 | }) 68 | // end::iterativeComp[] 69 | 70 | // the one where we didn't change anything should have the 71 | // lowest root mean squared error 72 | assert(errors.min == errors(2)) 73 | 74 | } 75 | 76 | test( "Two actions without caching ") { 77 | val rddA: RDD[(Double, Int)] = sc.parallelize(keyValuePairs) 78 | 79 | // tag::TwoActions[] 80 | val sorted = rddA.sortByKey() 81 | val count = sorted.count() // sorted Action 1 82 | val sample: Long = count / 10 83 | val sampled = sorted.take(sample.toInt) // sorted Action 2 84 | // end::TwoActions[] 85 | } 86 | 87 | test( "Two actions with caching "){ 88 | val rddA: RDD[(Double, Int)] = sc.parallelize(keyValuePairs) 89 | // tag::TwoActionsCache[] 90 | val sorted = rddA.sortByKey() 91 | sorted.persist() 92 | val count = sorted.count() // sorted Action 1 93 | val sample: Long = count / 10 94 | val sampled = sorted.take(sample.toInt) // sorted Action 2 95 | // end::TwoActionsCache[] 96 | } 97 | 98 | 99 | 100 | } 101 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package com.highperformancespark.examples.perf 18 | 19 | import org.apache.spark.SparkConf 20 | import org.apache.spark.SparkContext 21 | import org.apache.spark.rdd._ 22 | import org.apache.spark.sql.DataFrame 23 | import org.apache.spark.sql.Dataset 24 | import org.apache.spark.sql.Row 25 | import org.apache.spark.sql.SparkSession 26 | import org.apache.spark.sql.types._ 27 | 28 | import com.highperformancespark.examples.dataframe.RawPanda 29 | import com.highperformancespark.examples.tools._ 30 | 31 | /** 32 | * A simple performance test to compare a simple sort between DataFrame, and RDD 33 | */ 34 | object SimplePerfTest { 35 | // $COVERAGE-OFF$ 36 | def main(args: Array[String]) = { 37 | val sparkConf = new SparkConf().setAppName("simple-perf-test") 38 | val sparkSession = SparkSession.builder().enableHiveSupport().getOrCreate() 39 | val sc = sparkSession.sparkContext 40 | val scalingFactor = if (args.length > 0) args(0).toLong else 100L 41 | val size = if (args.length > 1) args(1).toInt else 50 42 | run(sc, sparkSession, scalingFactor, size) 43 | } 44 | 45 | def run(sc: SparkContext, session: SparkSession, 46 | scalingFactor: Long, size: Int) = { 47 | import session.implicits._ 48 | val inputRDD = GenerateScalingData.generateFullGoldilocks( 49 | sc, scalingFactor, size) 50 | val pairRDD = inputRDD.map(p => (p.zip.toInt, p.attributes(0))) 51 | pairRDD.cache() 52 | pairRDD.count() 53 | val rddTimeings = 1.to(10).map(x => time(testOnRDD(pairRDD))) 54 | val groupTimeings = 1.to(10).map(x => time(groupOnRDD(pairRDD))) 55 | val df = inputRDD.toDF() 56 | val inputDataFrame = df.select( 57 | df("zip").cast(IntegerType), 58 | df("attributes")(0).as("fuzzyness").cast(DoubleType)) 59 | inputDataFrame.cache() 60 | inputDataFrame.count() 61 | val dataFrameTimeings = 1.to(10).map(x => time(testOnDataFrame(inputDataFrame))) 62 | println(rddTimeings.map(_._2).mkString(",")) 63 | println(groupTimeings.map(_._2).mkString(",")) 64 | println(dataFrameTimeings.map(_._2).mkString(",")) 65 | } 66 | 67 | def testOnRDD(rdd: RDD[(Int, Double)]): Long = { 68 | val kvc: RDD[(Int, (Double , Int))] = rdd.map{case (x, y) => (x, (y, 1))} 69 | kvc.reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2)).count() 70 | } 71 | 72 | def groupOnRDD(rdd: RDD[(Int, Double)]) = { 73 | rdd.groupByKey().mapValues{v => 74 | v.aggregate((0.0, 0))({case (x, y) => (x._1 + y, x._2 + 1)}, 75 | {case (x, y) => (x._1 + y._1, x._2 + y._2)})}.count() 76 | } 77 | 78 | def testOnDataFrame(df: DataFrame) = { 79 | df.groupBy("zip").avg("fuzzyness").count() 80 | } 81 | 82 | def time[R](block: => R): (R, Long) = { 83 | val t0 = System.nanoTime() 84 | val result = block // call-by-name 85 | val t1 = System.nanoTime() 86 | println(s"Time ${t1 - t0}ns") 87 | (result, t1 - t0) 88 | } 89 | // $COVERAGE-ON$ 90 | } 91 | -------------------------------------------------------------------------------- /Dockerfile-mini: -------------------------------------------------------------------------------- 1 | # Open JDK11, Spark 3.X and the latest JDKs get a little spicy 2 | FROM azul/zulu-openjdk:11-latest 3 | 4 | RUN apt-get -qq update && \ 5 | apt-get -qq -y upgrade && \ 6 | apt-get -qq -y install gnupg software-properties-common locales curl tzdata apt-transport-https curl gnupg sudo net-tools psmisc htop python-is-python3 && \ 7 | locale-gen en_US.UTF-8 && \ 8 | apt-get -qq -y install gnupg software-properties-common curl git-core wget axel python3 python3-pip nano emacs vim && \ 9 | echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" 
| tee /etc/apt/sources.list.d/sbt.list && \ 10 | echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \ 11 | curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \ 12 | chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \ 13 | apt-get update && \ 14 | apt-get -qq -y install sbt && \ 15 | rm -rf /var/lib/apt/lists/* 16 | 17 | RUN curl -Lo coursier https://git.io/coursier-cli 18 | RUN chmod +x coursier 19 | # ensure the JAR of the CLI is in the coursier cache, in the image 20 | RUN ./coursier --help 21 | RUN pip install --no-cache-dir jupyter 22 | # Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3 23 | #RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8 24 | RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && dpkg --install scala-2.13.8.deb && rm scala-2.13.8.deb 25 | 26 | RUN ./coursier bootstrap \ 27 | -r jitpack \ 28 | -i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \ 29 | sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \ 30 | --default=true --sources \ 31 | -o almond && \ 32 | ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" 33 | 34 | 35 | RUN adduser dev 36 | RUN adduser dev sudo 37 | RUN echo 'dev:dev' | chpasswd 38 | RUN mkdir -p ~dev 39 | RUN cp ./coursier ~dev/ 40 | RUN echo "color_prompt=yes" >> ~dev/.bashrc 41 | RUN echo "export force_color_prompt=yes" >> ~dev/.bashrc 42 | RUN echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.2-bin-hadoop3" >> ~dev/.bashrc 43 | RUN chown -R dev ~dev 44 | USER dev 45 | # Kernels are installed in user so we need to run as the user 46 | RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" 47 | USER root 48 | 49 | RUN mkdir -p /high-performance-spark-examples 50 | RUN mkdir -p /high-performance-spark-examples/warehouse 51 | RUN chown -R dev /high-performance-spark-examples 52 | WORKDIR /high-performance-spark-examples 53 | # Increase the chance of caching by copying just the env setup file first. 
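# (Docker reuses the cached layer for the RUN below as long as env_setup.sh is
# unchanged, so later edits to the application code do not force the slow
# Spark/Iceberg download to be repeated.)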
54 | COPY --chown=dev:dev env_setup.sh ./ 55 | # Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place 56 | # Also downloads some test data 57 | RUN SCALA_VERSION=2.13 ./env_setup.sh && rm *.tgz 58 | RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back 59 | # Note: We need to use /home in the COPY otherwise no happy pandas 60 | COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new 61 | RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json 62 | RUN chown -R dev /high-performance-spark-examples 63 | ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/ 64 | RUN git clone https://github.com/holdenk/spark-upgrade.git 65 | RUN chown -R dev /high-performance-spark-examples 66 | USER dev 67 | RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history 68 | CMD ["/high-performance-spark-examples/misc/container_launch.sh"] 69 | 70 | -------------------------------------------------------------------------------- /python/examples/spark_expectations_example.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkFiles 2 | from pyspark.sql import * 3 | from spark_expectations.core.expectations import ( 4 | SparkExpectations, 5 | WrappedDataFrameWriter, 6 | ) 7 | 8 | spark = SparkSession.builder.master("local[4]").getOrCreate() 9 | sc = spark.sparkContext 10 | sc.setLogLevel("ERROR") 11 | 12 | # tag::global_setup[] 13 | se_conf = { 14 | "se_notifications_enable_email": False, 15 | "se_notifications_email_smtp_host": "mailhost.example.com", 16 | "se_notifications_email_smtp_port": 25, 17 | "se_notifications_email_from": "timbit@example.com", 18 | "se_notifications_email_subject": "spark expectations - data quality - notifications", 19 | "se_notifications_on_fail": True, 20 | "se_notifications_on_error_drop_exceeds_threshold_breach": True, 21 | "se_notifications_on_error_drop_threshold": 15, 22 | } 23 | # end::global_setup[] 24 | 25 | 26 | # tag::setup_and_load[] 27 | from spark_expectations.config.user_config import Constants as user_config 28 | 29 | spark.sql("DROP TABLE IF EXISTS local.magic_validation") 30 | spark.sql( 31 | """ 32 | create table local.magic_validation ( 33 | product_id STRING, 34 | table_name STRING, 35 | rule_type STRING, 36 | rule STRING, 37 | column_name STRING, 38 | expectation STRING, 39 | action_if_failed STRING, 40 | tag STRING, 41 | description STRING, 42 | enable_for_source_dq_validation BOOLEAN, 43 | enable_for_target_dq_validation BOOLEAN, 44 | is_active BOOLEAN, 45 | enable_error_drop_alert BOOLEAN, 46 | error_drop_threshold INT 47 | )""" 48 | ) 49 | # Reminder: addFile does not handle directories well. 50 | rule_file = "spark_expectations_sample_rules.json" 51 | sc.addFile(rule_file) 52 | df = spark.read.json(SparkFiles.get(rule_file)) 53 | print(df) 54 | df.write.option("byname", "true").mode("append").saveAsTable("local.magic_validation") 55 | spark.read.table("local.magic_validation").show() 56 | 57 | # Can be used to point to your desired metastore. 58 | se_writer = WrappedDataFrameWriter().mode("append").format("iceberg") 59 | 60 | rule_df = spark.sql("select * from local.magic_validation") 61 | 62 | se: SparkExpectations = SparkExpectations( 63 | rules_df=rule_df, # See if we can replace this with the DF we wrote out. 
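    # The remaining arguments wire SE to the Iceberg writer configured above:
    # stats rows and error rows are written through se_writer, and streaming
    # of the stats is disabled via the user_config constant.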
64 | product_id="pay", # We will only apply rules matching this product id 65 | stats_table="local.dq_stats", 66 | stats_table_writer=se_writer, 67 | target_and_error_table_writer=se_writer, 68 | stats_streaming_options={user_config.se_enable_streaming: False}, 69 | ) 70 | # end::setup_and_load[] 71 | rule_df.show(truncate=200) 72 | 73 | 74 | # tag::run_validation_row[] 75 | @se.with_expectations( 76 | user_conf=se_conf, 77 | write_to_table=False, # If set to true SE will write to the target table. 78 | target_and_error_table_writer=se_writer, 79 | # target_table is used to create the error table (e.g. here local.fake_table_name_error) 80 | # and filter the rules on top of the global product filter. 81 | target_table="local.fake_table_name", 82 | ) 83 | def load_data(): 84 | raw_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) 85 | uk_df = raw_df.select("CompanyNumber", "MaleBonusPercent", "FemaleBonuspercent") 86 | return uk_df 87 | 88 | 89 | # data = load_data() 90 | # end::run_validation_row[] 91 | 92 | 93 | # tag::run_validation_complex[] 94 | @se.with_expectations( 95 | user_conf=se_conf, 96 | write_to_table=True, # If set to true SE will write to the target table. 97 | target_and_error_table_writer=se_writer, 98 | # target_table is used to create the error table (e.g. here local.fake_table_name_error) 99 | # and filter the rules on top of the global product filter. 100 | target_table="local.3rd_fake", 101 | ) 102 | def load_data2(): 103 | raw_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) 104 | uk_df = raw_df.select("CompanyNumber", "MaleBonusPercent", "FemaleBonuspercent") 105 | return uk_df 106 | 107 | 108 | data = load_data2() 109 | # end::run_validation_complex[] 110 | 111 | spark.sql("SELECT * FROM local.3rd_fake_error").show(truncate=300) 112 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates how to use Spark accumulators. Note that most of these examples 3 | * are "dangerous" in that they may not return consistent results. 4 | */ 5 | package com.highperformancespark.examples.transformations 6 | 7 | import java.{lang => jl} 8 | 9 | import scala.collection.mutable.HashSet 10 | 11 | import org.apache.spark._ 12 | import org.apache.spark.rdd._ 13 | import org.apache.spark.util.AccumulatorV2 14 | 15 | import com.highperformancespark.examples.dataframe.RawPanda 16 | object Accumulators { 17 | /** 18 | * Compute the total fuzzyness with an accumulator while generating 19 | * an id and zip pair for sorting. 20 | */ 21 | //tag::sumFuzzyAcc[] 22 | def computeTotalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): 23 | (RDD[(String, Long)], Double) = { 24 | // Create an accumulator with the initial value of 0.0 25 | val acc = sc.doubleAccumulator 26 | val transformed = rdd.map{x => acc.add(x.attributes(0)); (x.zip, x.id)} 27 | // accumulator still has zero value 28 | // Note: This example is dangerous since the transformation may be 29 | // evaluated multiple times. 30 | transformed.count() // force evaluation 31 | (transformed, acc.value) 32 | } 33 | //end::sumFuzzyAcc[] 34 | 35 | /** 36 | * Compute the max fuzzyness with an accumulator while generating an 37 | * id and zip pair for sorting. 
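   *
   * (SparkContext's built-in numeric accumulators only track sums and counts,
   * which is why a custom AccumulatorV2 is defined below to track the maximum.)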
38 | */ 39 | //tag::maxFuzzyAcc[] 40 | def computeMaxFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): 41 | (RDD[(String, Long)], Double) = { 42 | class MaxDoubleParam extends AccumulatorV2[jl.Double, jl.Double] { 43 | var _value = Double.MinValue 44 | override def isZero(): Boolean = { 45 | _value == Double.MinValue 46 | } 47 | override def reset() = { 48 | _value = Double.MinValue 49 | } 50 | 51 | override def add(r1: jl.Double): Unit = { 52 | _value = Math.max(r1, _value) 53 | } 54 | 55 | def add(r1: Double): Unit = { 56 | _value = Math.max(r1, _value) 57 | } 58 | 59 | def copy(): MaxDoubleParam = { 60 | val newAcc = new MaxDoubleParam() 61 | newAcc._value = _value 62 | newAcc 63 | } 64 | 65 | override def merge(other: AccumulatorV2[jl.Double, jl.Double]): Unit = other match { 66 | case o: MaxDoubleParam => 67 | _value = Math.max(_value, o._value) 68 | case _ => 69 | throw new UnsupportedOperationException( 70 | s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") 71 | } 72 | 73 | override def value: jl.Double = _value 74 | } 75 | // Create an accumulator with the initial value of Double.MinValue 76 | val acc = new MaxDoubleParam() 77 | sc.register(acc) 78 | val transformed = rdd.map{x => acc.add(x.attributes(0)); (x.zip, x.id)} 79 | // accumulator still has Double.MinValue 80 | // Note: This example is dangerous since the transformation may be 81 | // evaluated multiple times. 82 | transformed.count() // force evaluation 83 | (transformed, acc.value) 84 | } 85 | //end::maxFuzzyAcc[] 86 | 87 | //tag::uniquePandaAcc[] 88 | def uniquePandas(sc: SparkContext, rdd: RDD[RawPanda]): HashSet[Long] = { 89 | class UniqParam extends AccumulatorV2[Long, HashSet[Long]] { 90 | val _values = new HashSet[Long] 91 | override def isZero() = _values.isEmpty 92 | 93 | override def copy(): UniqParam = { 94 | val nacc = new UniqParam 95 | nacc._values ++= _values 96 | nacc 97 | } 98 | 99 | override def reset(): Unit = { 100 | _values.clear() 101 | } 102 | 103 | override def merge(other: AccumulatorV2[Long, HashSet[Long]]): Unit = other match { 104 | case o: UniqParam => 105 | _values ++= o._values 106 | case _ => 107 | throw new UnsupportedOperationException( 108 | s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") 109 | } 110 | 111 | override def value: HashSet[Long] = _values 112 | // For adding new values 113 | override def add(t: Long) = { 114 | _values += t 115 | } 116 | } 117 | // Create an accumulator with the initial value of Double.MinValue 118 | val acc = new UniqParam() 119 | sc.register(acc) 120 | val transformed = rdd.map{x => acc.add(x.id); (x.zip, x.id)} 121 | // accumulator still has zero values 122 | transformed.count() // force evaluation 123 | acc.value 124 | } 125 | //end::uniquePandaAcc[] 126 | } 127 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * A sample mixing relational & functional transformations with Datasets. 
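 *
 * A minimal usage sketch (assuming a SparkSession named `spark` and a
 * Dataset[RawPanda] named `pandas`; both names are illustrative):
 * {{{
 * val mixed = new MixedDataset(spark.sqlContext)
 * val totalFuzzyness = mixed.happyPandaSums(pandas)
 * }}}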
3 | */ 4 | package com.highperformancespark.examples.dataframe 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.catalyst.expressions.aggregate._ 10 | import org.apache.spark.sql.expressions._ 11 | import org.apache.spark.sql.functions._ 12 | import org.apache.spark.sql.hive._ 13 | import org.apache.spark.sql.hive.thriftserver._ 14 | import org.apache.spark.sql.types._ 15 | 16 | case class MiniPandaInfo(zip: String, size: Double) 17 | 18 | class MixedDataset(sqlCtx: SQLContext) { 19 | import sqlCtx.implicits._ 20 | 21 | /** 22 | * A sample function on a Dataset of RawPandas. 23 | * 24 | * This is contrived, since our reduction could also be done with SQL aggregates, 25 | * but we can see the flexibility of being able to specify arbitrary Scala code. 26 | */ 27 | def happyPandaSums(ds: Dataset[RawPanda]): Double = { 28 | ds.toDF().filter($"happy" === true).as[RawPanda]. 29 | select($"attributes"(0).as[Double]). 30 | reduce((x, y) => x + y) 31 | } 32 | 33 | /** 34 | * A sample function on a Dataset of RawPandas. 35 | * Use the first attribute to deterimine if a panda is squishy. 36 | */ 37 | //tag::basicSelect[] 38 | def squishyPandas(ds: Dataset[RawPanda]): Dataset[(Long, Boolean)] = { 39 | ds.select($"id".as[Long], ($"attributes"(0) > 0.5).as[Boolean]) 40 | } 41 | //end::basicSelect[] 42 | 43 | /** 44 | * Union happy and sad pandas 45 | */ 46 | //tag::basicUnion[] 47 | def unionPandas(happyPandas: Dataset[RawPanda], sadPandas: Dataset[RawPanda]) = { 48 | happyPandas.union(sadPandas) 49 | } 50 | //end::basicUnion[] 51 | 52 | /** 53 | * Functional map + Dataset, sums the positive attributes for the pandas 54 | */ 55 | //tag::functionalQuery[] 56 | def funMap(ds: Dataset[RawPanda]): Dataset[Double] = { 57 | ds.map{rp => rp.attributes.filter(_ > 0).sum} 58 | } 59 | //end::functionalQuery[] 60 | 61 | //tag::maxPandaSizePerZip[] 62 | def maxPandaSizePerZip(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { 63 | ds.map(rp => MiniPandaInfo(rp.zip, rp.attributes(2))) 64 | .groupByKey(mp => mp.zip).agg(max("size").as[Double]) 65 | } 66 | //end::maxPandaSizePerZip[] 67 | 68 | //tag::maxPandaSizePerZipScala[] 69 | def maxPandaSizePerZipScala(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { 70 | def groupMapFun(g: String, iter: Iterator[RawPanda]): (String, Double) = { 71 | (g, iter.map(_.attributes(2)).reduceLeft(Math.max(_, _))) 72 | } 73 | ds.groupByKey(rp => rp.zip).mapGroups(groupMapFun) 74 | } 75 | //end::maxPandaSizePerZipScala[] 76 | 77 | /** 78 | * Illustrate how we make typed queries, using some of the float properties 79 | * to produce boolean values. 
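   * (The body below simply selects the first attribute as a typed Double column;
   * see squishyPandas above for the variant that produces the boolean values.)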
80 | */ 81 | def typedQueryExample(ds: Dataset[RawPanda]): Dataset[Double] = { 82 | ds.select($"attributes"(0).as[Double]) 83 | } 84 | 85 | /** 86 | * Illustrate Dataset joins 87 | */ 88 | def joinSample(pandas: Dataset[RawPanda], coffeeShops: Dataset[CoffeeShop]): 89 | Dataset[(RawPanda, CoffeeShop)] = { 90 | //tag::joinWith[] 91 | val result: Dataset[(RawPanda, CoffeeShop)] = pandas.joinWith(coffeeShops, 92 | pandas("zip") === coffeeShops("zip")) 93 | //end::joinWith[] 94 | result 95 | } 96 | 97 | /** 98 | * Illustrate a self join to compare pandas in the same zip code 99 | */ 100 | def selfJoin(pandas: Dataset[RawPanda]): 101 | Dataset[(RawPanda, RawPanda)] = { 102 | //tag::selfJoin[] 103 | val result: Dataset[(RawPanda, RawPanda)] = pandas.as("l").joinWith(pandas.as("r"), 104 | $"l.zip" === $"r.zip") 105 | //end::selfJoin[] 106 | result 107 | } 108 | 109 | //tag::fromRDD[] 110 | /** 111 | * Illustrate converting an RDD to DS 112 | */ 113 | def fromRDD(rdd: RDD[RawPanda]): Dataset[RawPanda] = { 114 | rdd.toDS 115 | } 116 | 117 | //end::fromRDD[] 118 | 119 | //tag::toRDDDF[] 120 | /** 121 | * Illustrate converting a Dataset to an RDD 122 | */ 123 | def toRDD(ds: Dataset[RawPanda]): RDD[RawPanda] = { 124 | ds.rdd 125 | } 126 | 127 | /** 128 | * Illustrate converting a Dataset to a DataFrame 129 | */ 130 | def toDF(ds: Dataset[RawPanda]): DataFrame = { 131 | ds.toDF() 132 | } 133 | //end::toRDDDF[] 134 | 135 | /** 136 | * Illustrate DataFrame to Dataset. Its important to note that if the schema 137 | * does not match what is expected by the Dataset this fails fast. 138 | */ 139 | //tag::DataFrameAsDataset[] 140 | def fromDF(df: DataFrame): Dataset[RawPanda] = { 141 | df.as[RawPanda] 142 | } 143 | //end::DataFrameAsDataset[] 144 | } 145 | -------------------------------------------------------------------------------- /python/examples/bad_pyspark.py: -------------------------------------------------------------------------------- 1 | # This script triggers a number of different PySpark errors 2 | 3 | from pyspark.sql.session import SparkSession 4 | import sys 5 | 6 | global sc 7 | 8 | 9 | def nonExistentInput(sc): 10 | """ 11 | Attempt to load non existent input 12 | >>> nonExistentInput(sc) 13 | Traceback (most recent call last): 14 | ... 15 | Py4JJavaError:... 16 | """ 17 | # tag::nonExistent[] 18 | failedRdd = sc.textFile("file:///doesnotexist") 19 | failedRdd.count() 20 | # end::nonExistent[] 21 | 22 | 23 | def throwOuter(sc): 24 | """ 25 | Attempt to load non existant input 26 | >>> throwOuter(sc) 27 | Traceback (most recent call last): 28 | ... 29 | Py4JJavaError:... 30 | """ 31 | # tag::throwOuter[] 32 | data = sc.parallelize(range(10)) 33 | transform1 = data.map(lambda x: x + 1) 34 | transform2 = transform1.map(lambda x: x / 0) 35 | transform2.count() 36 | # end::throwOuter[] 37 | 38 | 39 | def throwInner(sc): 40 | """ 41 | Attempt to load non existant input 42 | >>> throwInner(sc) 43 | Traceback (most recent call last): 44 | ... 45 | Py4JJavaError:... 46 | """ 47 | # tag::throwInner[] 48 | data = sc.parallelize(range(10)) 49 | transform1 = data.map(lambda x: x / 0) 50 | transform2 = transform1.map(lambda x: x + 1) 51 | transform2.count() 52 | # end::throwInner[] 53 | 54 | 55 | # tag::rewrite[] 56 | def add1(x): 57 | """ 58 | Add 1 59 | >>> add1(2) 60 | 3 61 | """ 62 | return x + 1 63 | 64 | 65 | def divZero(x): 66 | """ 67 | Divide by zero (cause an error) 68 | >>> divZero(2) 69 | Traceback (most recent call last): 70 | ... 
71 | ZeroDivisionError: integer division or modulo by zero 72 | """ 73 | return x / 0 74 | 75 | 76 | def throwOuter2(sc): 77 | """ 78 | Attempt to load non existant input 79 | >>> throwOuter2(sc) 80 | Traceback (most recent call last): 81 | ... 82 | Py4JJavaError:... 83 | """ 84 | data = sc.parallelize(range(10)) 85 | transform1 = data.map(add1) 86 | transform2 = transform1.map(divZero) 87 | transform2.count() 88 | 89 | 90 | def throwInner2(sc): 91 | """ 92 | Attempt to load non existant input 93 | >>> throwInner2(sc) 94 | Traceback (most recent call last): 95 | ... 96 | Py4JJavaError:... 97 | """ 98 | data = sc.parallelize(range(10)) 99 | transform1 = data.map(divZero) 100 | transform2 = transform1.map(add1) 101 | transform2.count() 102 | 103 | 104 | # end::rewrite[] 105 | 106 | 107 | def throwInner3(sc): 108 | """ 109 | Attempt to load non existant input 110 | >>> throwInner3(sc) 111 | Reject 10 112 | """ 113 | data = sc.parallelize(range(10)) 114 | rejectedCount = sc.accumulator(0) 115 | 116 | def loggedDivZero(x): 117 | import logging 118 | 119 | try: 120 | return [x / 0] 121 | except Exception as e: 122 | rejectedCount.add(1) 123 | logging.warning("Error found " + repr(e)) 124 | return [] 125 | 126 | transform1 = data.flatMap(loggedDivZero) 127 | transform2 = transform1.map(add1) 128 | transform2.count() 129 | print("Reject " + str(rejectedCount.value)) 130 | 131 | 132 | def runOutOfMemory(sc): 133 | """ 134 | Run out of memory on the workers from a skewed shuffle. 135 | >>> runOutOfMemory(sc) # doctest: +SKIP 136 | Traceback (most recent call last): 137 | ... 138 | Py4JJavaError:... 139 | """ 140 | # tag::worker_oom[] 141 | data = sc.parallelize(range(10000)) 142 | 143 | def generate_too_much(i: int): 144 | return list(map(lambda v: (i % 2, v), range(100000 * i))) 145 | 146 | bad = data.flatMap(generate_too_much).groupByKey() 147 | bad.count() 148 | # end::worker_oom[] 149 | 150 | 151 | def _setupTest(): 152 | globs = globals() 153 | spark = SparkSession.builder.master("local[4]").getOrCreate() 154 | sc = spark._sc 155 | globs["sc"] = sc 156 | return globs 157 | 158 | 159 | def _test(): 160 | """ 161 | Run the tests. 162 | Note this will print a lot of error message to stderr since we don't 163 | capture the JVM sub process stdout/stderr for doctests. 
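
    The doctests can be run directly with python bad_pyspark.py, assuming
    pyspark is importable in the current environment.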
164 | """ 165 | import doctest 166 | 167 | globs = _setupTest() 168 | (failure_count, test_count) = doctest.testmod( 169 | globs=globs, optionflags=doctest.ELLIPSIS 170 | ) 171 | print("All tests done, stopping Spark context.") 172 | globs["sc"].stop() 173 | if failure_count: 174 | exit(-1) 175 | else: 176 | exit(0) 177 | 178 | 179 | if __name__ == "__main__": 180 | _test() 181 | # Hack to support running in nose 182 | elif sys.stdout != sys.__stdout__: 183 | _setupTest() 184 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.dataframe; 2 | 3 | import com.highperformancespark.examples.objects.JavaPandaPlace; 4 | import com.highperformancespark.examples.objects.JavaRawPanda; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.sql.*; 7 | import org.apache.spark.sql.types.*; 8 | 9 | import java.util.List; 10 | import java.util.Properties; 11 | import java.util.stream.Collectors; 12 | 13 | public class JavaLoadSave { 14 | private SQLContext sqlContext; 15 | 16 | public JavaLoadSave(SQLContext sqlContext) { 17 | this.sqlContext = sqlContext; 18 | } 19 | 20 | //tag::createFromRDD[] 21 | public Dataset createFromJavaBean(JavaRDD input) { 22 | // Create DataFrame using Java Bean 23 | Dataset df1 = sqlContext.createDataFrame(input, JavaPandaPlace.class); 24 | 25 | // Create DataFrame using JavaRDD 26 | JavaRDD rowRDD = input.map(pm -> RowFactory.create(pm.getName(), 27 | pm.getPandas().stream() 28 | .map(pi -> RowFactory.create(pi.getId(), pi.getZip(), pi.isHappy(), pi.getAttributes())) 29 | .collect(Collectors.toList()))); 30 | 31 | ArrayType pandasType = DataTypes.createArrayType(new StructType( 32 | new StructField[]{ 33 | new StructField("id", DataTypes.LongType, true, Metadata.empty()), 34 | new StructField("zip", DataTypes.StringType, true, Metadata.empty()), 35 | new StructField("happy", DataTypes.BooleanType, true, Metadata.empty()), 36 | new StructField("attributes", DataTypes.createArrayType(DataTypes.FloatType), true, Metadata.empty()) 37 | } 38 | )); 39 | 40 | StructType schema = new StructType(new StructField[]{ 41 | new StructField("name", DataTypes.StringType, true, Metadata.empty()), 42 | new StructField("pandas", pandasType, true, Metadata.empty()) 43 | }); 44 | 45 | Dataset df2 = sqlContext.createDataFrame(rowRDD, schema); 46 | return df2; 47 | } 48 | //end::createFromRDD[] 49 | 50 | //tag::createFromLocal[] 51 | public Dataset createFromLocal(List input) { 52 | return sqlContext.createDataFrame(input, PandaPlace.class); 53 | } 54 | //end::createFromLocal[] 55 | 56 | //tag::collectResults[] 57 | public List collectDF(Dataset df) { 58 | return df.collectAsList(); 59 | } 60 | //end::collectResults[] 61 | 62 | //tag::toRDD[] 63 | public JavaRDD toRDD(Dataset input) { 64 | JavaRDD rdd = input.javaRDD().map(row -> new JavaRawPanda(row.getLong(0), row.getString(1), 65 | row.getString(2), row.getBoolean(3), row.getList(4))); 66 | return rdd; 67 | } 68 | //end::toRDD[] 69 | 70 | //tag::partitionedOutput[] 71 | public void writeOutByZip(Dataset input) { 72 | input.write().partitionBy("zipcode").format("json").save("output/"); 73 | } 74 | //end::partitionedOutput[] 75 | 76 | //tag::saveAppend[] 77 | public void writeAppend(Dataset input) { 78 | input.write().mode(SaveMode.Append).save("output/"); 79 | } 80 | //end::saveAppend[] 81 
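  // Note: "jdbc:dialect:serverName" in the examples below is a placeholder URL;
  // a MySQL URL, for instance, would look more like "jdbc:mysql://host:3306/db".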
| 82 | public Dataset createJDBC() { 83 | //tag::createJDBC[] 84 | Dataset df1 = sqlContext.read().jdbc("jdbc:dialect:serverName;user=user;password=pass", 85 | "table", new Properties()); 86 | 87 | Dataset df2 = sqlContext.read().format("jdbc") 88 | .option("url", "jdbc:dialect:serverName") 89 | .option("dbtable", "table").load(); 90 | 91 | return df2; 92 | //end::createJDBC[] 93 | } 94 | 95 | public void writeJDBC(Dataset df) { 96 | //tag::writeJDBC[] 97 | df.write().jdbc("jdbc:dialect:serverName;user=user;password=pass", 98 | "table", new Properties()); 99 | 100 | df.write().format("jdbc") 101 | .option("url", "jdbc:dialect:serverName") 102 | .option("user", "user") 103 | .option("password", "pass") 104 | .option("dbtable", "table").save(); 105 | //end::writeJDBC[] 106 | } 107 | 108 | //tag::loadParquet[] 109 | public Dataset loadParquet(String path) { 110 | // Configure Spark to read binary data as string, note: must be configured on SQLContext 111 | sqlContext.setConf("spark.sql.parquet.binaryAsString", "true"); 112 | 113 | // Load parquet data using merge schema (configured through option) 114 | Dataset df = sqlContext.read() 115 | .option("mergeSchema", "true") 116 | .format("parquet") 117 | .load(path); 118 | 119 | return df; 120 | } 121 | //end::loadParquet[] 122 | 123 | //tag::writeParquet[] 124 | public void writeParquet(Dataset df, String path) { 125 | df.write().format("parquet").save(path); 126 | } 127 | //end::writeParquet[] 128 | 129 | //tag::loadHiveTable[] 130 | public Dataset loadHiveTable() { 131 | return sqlContext.read().table("pandas"); 132 | } 133 | //end::loadHiveTable[] 134 | 135 | //tag::saveManagedTable[] 136 | public void saveManagedTable(Dataset df) { 137 | df.write().saveAsTable("pandas"); 138 | } 139 | //end::saveManagedTable[] 140 | } 141 | -------------------------------------------------------------------------------- /python/examples/simple_perf.py: -------------------------------------------------------------------------------- 1 | # When running this example make sure to include the built Scala jar : 2 | # 3 | # $SPARK_HOME/bin/pyspark --jars \ 4 | # ./target/examples-0.0.1.jar --driver-class-path ./target/examples-0.0.1.jar 5 | # 6 | # This example illustrates how to interface Scala and Python code, but caution 7 | # should be taken as it depends on many private members that may change in 8 | # future releases of Spark. 9 | 10 | from pyspark.sql.types import StructType, IntegerType, DoubleType, StructField 11 | from pyspark.sql import DataFrame, SparkSession 12 | import sys 13 | import timeit 14 | import time 15 | 16 | 17 | def generate_scale_data(sqlCtx, rows, numCols): 18 | """ 19 | Generate scale data for the performance test. 20 | 21 | This also illustrates calling custom Scala code from the driver. 22 | 23 | .. Note: This depends on many internal methods and may break between versions. 
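
    (The ``100L`` literal in the doctest below is Python 2 long syntax; under
    Python 3 the call would need a plain ``100``.)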
24 | 25 | # This assumes our jars have been added with export PYSPARK_SUBMIT_ARGS 26 | >>> session = SparkSession.builder.getOrCreate() 27 | >>> scaleData = generate_scale_data(session, 100L, 1) 28 | >>> scaleData[0].count() 29 | 100 30 | >>> scaleData[1].count() 31 | 100 32 | >>> session.stop() 33 | """ 34 | # tag::javaInterop[] 35 | sc = sqlCtx._sc 36 | javaSparkSession = sqlCtx._jsparkSession 37 | jsc = sc._jsc 38 | scalasc = jsc.sc() 39 | gateway = sc._gateway 40 | # Call a java method that gives us back an RDD of JVM Rows (Int, Double) 41 | # While Python RDDs are wrapped Java RDDs (even of Rows) the contents are 42 | # different, so we can't directly wrap this. 43 | # This returns a Java RDD of Rows - normally it would better to 44 | # return a DataFrame directly, but for illustration we will work 45 | # with an RDD of Rows. 46 | java_rdd = gateway.jvm.com.highperformancespark.examples.tools.GenerateScalingData.generateMiniScaleRows( 47 | scalasc, rows, numCols 48 | ) 49 | # Schemas are serialized to JSON and sent back and forth 50 | # Construct a Python Schema and turn it into a Java Schema 51 | schema = StructType( 52 | [StructField("zip", IntegerType()), StructField("fuzzyness", DoubleType())] 53 | ) 54 | jschema = javaSparkSession.parseDataType(schema.json()) 55 | # Convert the Java RDD to Java DataFrame 56 | java_dataframe = javaSparkSession.createDataFrame(java_rdd, jschema) 57 | # Wrap the Java DataFrame into a Python DataFrame 58 | python_dataframe = DataFrame(java_dataframe, sqlCtx) 59 | # Convert the Python DataFrame into an RDD 60 | pairRDD = python_dataframe.rdd.map(lambda row: (row[0], row[1])) 61 | return (python_dataframe, pairRDD) 62 | # end::javaInterop[] 63 | 64 | 65 | def runOnDF(df): 66 | result = df.groupBy("zip").avg("fuzzyness").count() 67 | return result 68 | 69 | 70 | def runOnRDD(rdd): 71 | result = ( 72 | rdd.map(lambda xy: (xy[0], (xy[1], 1))) 73 | .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) 74 | .count() 75 | ) 76 | return result 77 | 78 | 79 | def groupOnRDD(rdd): 80 | return rdd.groupByKey().mapValues(lambda v: sum(v) / float(len(v))).count() 81 | 82 | 83 | def run(sc, sqlCtx, scalingFactor, size): 84 | """ 85 | Run the simple perf test printing the results to stdout. 86 | 87 | >>> session = SparkSession.builder.getOrCreate() 88 | >>> sc = session._sc 89 | >>> run(sc, session, 5L, 1) 90 | RDD: 91 | ... 92 | group: 93 | ... 94 | df: 95 | ... 96 | yay 97 | >>> session.stop() 98 | """ 99 | (input_df, input_rdd) = generate_scale_data(sqlCtx, scalingFactor, size) 100 | input_rdd.cache().count() 101 | rddTimeings = timeit.repeat( 102 | stmt=lambda: runOnRDD(input_rdd), 103 | repeat=10, 104 | number=1, 105 | timer=time.time, 106 | setup="gc.enable()", 107 | ) 108 | groupTimeings = timeit.repeat( 109 | stmt=lambda: groupOnRDD(input_rdd), 110 | repeat=10, 111 | number=1, 112 | timer=time.time, 113 | setup="gc.enable()", 114 | ) 115 | input_df.cache().count() 116 | dfTimeings = timeit.repeat( 117 | stmt=lambda: runOnDF(input_df), 118 | repeat=10, 119 | number=1, 120 | timer=time.time, 121 | setup="gc.enable()", 122 | ) 123 | print(f"RDD: {rddTimeings}, group: {groupTimeings}, df: {dfTimeings}") 124 | 125 | 126 | def parseArgs(args): 127 | """ 128 | Parse the args, no error checking. 
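
    args[0] is expected to be the script name (as in sys.argv), which is why
    the scaling factor and size are read from args[1] and args[2].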
129 | 130 | >>> parseArgs(["foobaz", "1", "2"]) 131 | (1, 2) 132 | """ 133 | scalingFactor = int(args[1]) 134 | size = int(args[2]) 135 | return (scalingFactor, size) 136 | 137 | 138 | if __name__ == "__main__": 139 | """ 140 | Usage: simple_perf_test scalingFactor size 141 | """ 142 | 143 | scalingFactor = 1 144 | size = 1 145 | if len(sys.argv) > 2: 146 | (scalingFactor, size) = parseArgs(sys.argv) 147 | session = SparkSession.builder.appName("SimplePythonPerf").getOrCreate() 148 | sc = session._sc 149 | run(sc, session, scalingFactor, size) 150 | 151 | sc.stop() 152 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.goldilocks 2 | 3 | import scala.collection.immutable.IndexedSeq 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.sql.DataFrame 8 | import org.apache.spark.sql.Row 9 | import org.apache.spark.sql.SQLContext 10 | import org.apache.spark.sql.types.DoubleType 11 | import org.apache.spark.sql.types.StructField 12 | import org.apache.spark.sql.types.StructType 13 | 14 | import com.holdenkarau.spark.testing.SharedSparkContext 15 | import org.scalatest.funsuite.AnyFunSuite 16 | import org.apache.spark.sql.SparkSession 17 | 18 | class GoldilocksLargeTests extends AnyFunSuite with SharedSparkContext{ 19 | 20 | 21 | def testGoldilocksImplementations( 22 | data: DataFrame, targetRanks: List[Long], 23 | expectedResult: Map[Int, Iterable[Long]]) = { 24 | 25 | val iterative = 26 | GoldilocksWhileLoop.findRankStatistics(data, targetRanks) 27 | val groupByKey = 28 | GoldilocksGroupByKey.findRankStatistics(data, targetRanks) 29 | val firstTry = 30 | GoldilocksFirstTry.findRankStatistics(data, targetRanks) 31 | val hashMap = 32 | GoldilocksWithHashMap.findRankStatistics(data, targetRanks) 33 | val secondarySort = 34 | GoldilocksSecondarySort.findRankStatistics(data, targetRanks, 35 | data.rdd.partitions.length) 36 | val secondarySortV2 = 37 | GoldilocksSecondarySortV2.findRankStatistics(data, targetRanks) 38 | 39 | expectedResult.foreach { 40 | case((i, ranks)) => 41 | assert(iterative(i).equals(ranks), 42 | "The Iterative solution to goldilocks was incorrect for column " + i) 43 | assert(groupByKey(i).equals(ranks), 44 | "Group by key solution was incorrect") 45 | assert(firstTry(i).equals(ranks), 46 | "GoldilocksFirstTry incorrect for column " + i ) 47 | assert(hashMap(i).equals(ranks), 48 | "GoldilocksWithhashMap incorrect for column " + i) 49 | assert(secondarySort(i).equals(ranks)) 50 | assert(secondarySortV2(i).equals(ranks)) 51 | 52 | } 53 | } 54 | 55 | test("Goldilocks on local data solution "){ 56 | val sqlContext = SparkSession.builder.getOrCreate().sqlContext 57 | val testRanks = List(3L, 8L) 58 | val (smallTestData, result) = 59 | DataCreationUtils.createLocalTestData(5, 10, testRanks) 60 | val schema = StructType( 61 | result.keys.toSeq.map( 62 | n => StructField("Column" + n.toString, DoubleType) 63 | )) 64 | val smallTestDF: DataFrame = 65 | sqlContext.createDataFrame(sc.makeRDD(smallTestData), schema) 66 | testGoldilocksImplementations(smallTestDF, testRanks, result) 67 | } 68 | } 69 | 70 | object DataCreationUtils { 71 | def createLocalTestData(numberCols: Int, numberOfRows: Int, 72 | targetRanks: List[Long]) = { 73 | 74 | val cols = Range(0,numberCols).toArray 75 | 
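    // Each column below is a random permutation of 0 until numberOfRows, so with
    // scalers of 1.0 the rank-r statistic of every column is simply r - 1.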
val scalers = cols.map(x => 1.0) 76 | val rowRange = Range(0, numberOfRows) 77 | val columnArray: Array[IndexedSeq[Double]] = cols.map( 78 | columnIndex => { 79 | val columnValues = rowRange.map( 80 | x => (Math.random(), x)).sortBy(_._1).map(_._2 * scalers(columnIndex)) 81 | columnValues 82 | }) 83 | val rows = rowRange.map( 84 | rowIndex => { 85 | Row.fromSeq(cols.map( colI => columnArray(colI)(rowIndex)).toSeq) 86 | }) 87 | 88 | 89 | val result: Map[Int, Iterable[Long]] = cols.map(i => { 90 | (i, targetRanks.map(r => Math.round((r-1)/scalers(i)))) 91 | }).toMap 92 | 93 | (rows, result) 94 | } 95 | 96 | 97 | def createDistributedData(sc: SparkContext, partitions: Int, 98 | elementsPerPartition: Int, numberOfColumns: Int ) = { 99 | val partitionsStart: RDD[Int] = sc.parallelize( 100 | Array.fill(partitions)(1)) 101 | partitionsStart.repartition(partitions) 102 | 103 | var data: RDD[(Long, List[Int])] = partitionsStart.mapPartitionsWithIndex { 104 | case (partIndex, elements) => 105 | val rows = Range(0, elementsPerPartition) 106 | .map(x => (Math.random(), x)) 107 | .map { 108 | case ((randomNumber, rowValue)) => 109 | (randomNumber, 110 | //index of element 111 | (partIndex * elementsPerPartition.toLong + rowValue, 112 | List(rowValue + partIndex * elementsPerPartition))) 113 | } 114 | rows.toIterator 115 | }.sortByKey().values 116 | 117 | 118 | Range(0, numberOfColumns).foreach(x => { 119 | val nextColumn: RDD[(Long, Int)] = partitionsStart.mapPartitionsWithIndex { 120 | case (partIndex, elements) => 121 | val rows = Range(0, elementsPerPartition) 122 | .map(x => (Math.random(), x)) 123 | .map { 124 | case ((randomNumber, rowValue)) => 125 | (randomNumber, 126 | //index of element 127 | (partIndex * elementsPerPartition.toLong + rowValue, 128 | rowValue + partIndex * elementsPerPartition)) 129 | } 130 | rows.toIterator 131 | }.sortByKey().values 132 | 133 | data = nextColumn.join(data).mapValues(x => x._1 :: x._2) 134 | }) 135 | data 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.goldilocks 2 | 3 | import scala.collection.Map 4 | import scala.reflect.ClassTag 5 | 6 | import org.apache.spark.HashPartitioner 7 | import org.apache.spark.rdd.RDD 8 | 9 | object RDDJoinExamples { 10 | 11 | /* For Example, suppose we have one RDD with some data in the form (Panda id, score) 12 | and another RDD with (Panda id, address), and we want to send each Panda some mail 13 | with her best score. We could join the RDDs on ID and then compute the best score 14 | for each address. Like this: 15 | 16 | 'ToDo: Insert Example' 17 | 18 | However, this is slower than first reducing the score data, so that the 19 | //first dataset contains only one row for each Panda with her best score and then 20 | //joining that data with the address data. 
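  (The naive join is implemented below as joinScoresWithAddress1, and the
  reduce-first version as joinScoresWithAddress2, tagged joinScoresWithAddressFast.)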
21 | 22 | 'ToDO: Insert an example of this' */ 23 | //tag::joinScoresWithAddress[] 24 | def joinScoresWithAddress1( scoreRDD : RDD[(Long, Double)], 25 | addressRDD : RDD[(Long, String )]) : RDD[(Long, (Double, String))]= { 26 | val joinedRDD = scoreRDD.join(addressRDD) 27 | joinedRDD.reduceByKey( (x, y) => if(x._1 > y._1) x else y ) 28 | } 29 | //end::joinScoresWithAddress[] 30 | 31 | //tag::leftOuterJoinScoresWithAddress[] 32 | def outerJoinScoresWithAddress(scoreRDD : RDD[(Long, Double)], 33 | addressRDD: RDD[(Long, String)]) : RDD[(Long, (Double, Option[String]))]= { 34 | val joinedRDD = scoreRDD.leftOuterJoin(addressRDD) 35 | joinedRDD.reduceByKey( (x, y) => if(x._1 > y._1) x else y ) 36 | } 37 | //end::leftOuterJoinScoresWithAddress[] 38 | 39 | //tag::joinScoresWithAddressFast[] 40 | def joinScoresWithAddress2(scoreRDD : RDD[(Long, Double)], 41 | addressRDD: RDD[(Long, String)]) : RDD[(Long, (Double, String))]= { 42 | val bestScoreData = scoreRDD.reduceByKey((x, y) => if(x > y) x else y) 43 | bestScoreData.join(addressRDD) 44 | } 45 | //end::joinScoresWithAddressFast[] 46 | /* 47 | We could make the example in the previous section even faster, 48 | by using the partitioner for the address data as an argument for 49 | the reduce by key step. 50 | 'ToDO: Insert the code to show this here' */ 51 | //tag::joinScoresWithAddress3[] 52 | def joinScoresWithAddress3(scoreRDD: RDD[(Long, Double)], 53 | addressRDD: RDD[(Long, String)]) : RDD[(Long, (Double, String))]= { 54 | // If addressRDD has a known partitioner we should use that, 55 | // otherwise it has a default hash parttioner, which we can reconstruct by 56 | // getting the number of partitions. 57 | val addressDataPartitioner = addressRDD.partitioner match { 58 | case (Some(p)) => p 59 | case (None) => new HashPartitioner(addressRDD.partitions.length) 60 | } 61 | val bestScoreData = scoreRDD.reduceByKey(addressDataPartitioner, 62 | (x, y) => if(x > y) x else y) 63 | bestScoreData.join(addressRDD) 64 | } 65 | //end::joinScoresWithAddress3[] 66 | 67 | def debugString(scoreRDD: RDD[(Long, Double)], 68 | addressRDD: RDD[(Long, String)]) = { 69 | //tag::debugString[] 70 | scoreRDD.join(addressRDD).toDebugString 71 | //end::debugString[] 72 | } 73 | 74 | /* 75 | * Suppose we had two datasets of information about each panda, 76 | * one with the scores, and one with there favorite foods. 77 | * We could use cogroup to associate each Pandas id with an iterator 78 | * of their scores and another iterator of their favorite foods. 79 | */ 80 | def coGroupExample(scoreRDD: RDD[(Long, Double)], foodRDD: RDD[(Long, String)], 81 | addressRDD: RDD[(Long, String)]) = { 82 | //tag::coGroupExample1[] 83 | val cogroupedRDD: RDD[(Long, (Iterable[Double], Iterable[String]))] = 84 | scoreRDD.cogroup(foodRDD) 85 | //end::coGroupExample1[] 86 | 87 | /* 88 | * For example, if we needed to join the panda score data with both address 89 | * and favorite foods, it would be better to use co group than two 90 | * join operations. 91 | */ 92 | //tag::coGroupExample2[] 93 | val addressScoreFood = addressRDD.cogroup(scoreRDD, foodRDD) 94 | //end::coGroupExample2[] 95 | } 96 | 97 | /** 98 | * Performs a broadcast hash join for two RDDs. 
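   *
   * A minimal usage sketch (assuming scoreRDD: RDD[(Long, Double)] and a small
   * addressRDD: RDD[(Long, String)], as in the join examples above):
   * {{{
   * val joined = manualBroadcastHashJoin(scoreRDD, addressRDD)
   * }}}
   *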
99 | * @param bigRDD - the first rdd, should be the larger RDD 100 | * @param smallRDD - the small rdd, should be small enough to fit in memory 101 | * @tparam K - The type of the key 102 | * @tparam V1 - The type of the values for the large array 103 | * @tparam V2 - The type of the values for the second array 104 | * @return 105 | */ 106 | //tag::coreBroadcast[] 107 | def manualBroadcastHashJoin[K : Ordering : ClassTag, V1 : ClassTag, 108 | V2 : ClassTag](bigRDD : RDD[(K, V1)], 109 | smallRDD : RDD[(K, V2)])= { 110 | val smallRDDLocal: Map[K, V2] = smallRDD.collectAsMap() 111 | val smallRDDLocalBcast = bigRDD.sparkContext.broadcast(smallRDDLocal) 112 | bigRDD.mapPartitions(iter => { 113 | iter.flatMap{ 114 | case (k,v1 ) => 115 | smallRDDLocalBcast.value.get(k) match { 116 | // Note: You could switch this to a left join by changing the empty seq 117 | // to instead return Seq(k, Seq.empty[(V1, V2)]) 118 | case None => Seq.empty[(K, (V1, V2))] 119 | case Some(v2) => Seq((k, (v1, v2))) 120 | } 121 | } 122 | }, preservesPartitioning = true) 123 | } 124 | //end::coreBroadcast[] 125 | } 126 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Load and save data to/from DataFrames 3 | */ 4 | package com.highperformancespark.examples.dataframe 5 | 6 | import java.util.Properties 7 | 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.types._ 12 | 13 | case class LoadSave(sc: SparkContext, session: SparkSession) { 14 | import session.implicits._ 15 | //tag::createFromRDD[] 16 | def createFromCaseClassRDD(input: RDD[PandaPlace]) = { 17 | // Create DataFrame explicitly using session and schema inference 18 | val df1 = session.createDataFrame(input) 19 | 20 | // Create DataFrame using session implicits and schema inference 21 | val df2 = input.toDF() 22 | 23 | // Create a Row RDD from our RDD of case classes 24 | val rowRDD = input.map(pm => Row(pm.name, 25 | pm.pandas.map(pi => Row(pi.id, pi.zip, pi.happy, pi.attributes)))) 26 | 27 | val pandasType = ArrayType(StructType(List( 28 | StructField("id", LongType, true), 29 | StructField("zip", StringType, true), 30 | StructField("happy", BooleanType, true), 31 | StructField("attributes", ArrayType(FloatType), true)))) 32 | 33 | // Create DataFrame explicitly with specified schema 34 | val schema = StructType(List(StructField("name", StringType, true), 35 | StructField("pandas", pandasType))) 36 | 37 | val df3 = session.createDataFrame(rowRDD, schema) 38 | } 39 | //end::createFromRDD[] 40 | 41 | //tag::createFromRDDBasic[] 42 | def createFromCaseClassRDD(input: Seq[PandaPlace]) = { 43 | val rdd = sc.parallelize(input) 44 | // Create DataFrame explicitly using session and schema inference 45 | val df1 = session.createDataFrame(input) 46 | } 47 | //end::createFromRDDBasic[] 48 | 49 | //tag::createGetSchema[] 50 | def createAndPrintSchema() = { 51 | val damao = RawPanda(1, "M1B 5K7", "giant", true, Array(0.1, 0.1)) 52 | val pandaPlace = PandaPlace("toronto", Array(damao)) 53 | val df = session.createDataFrame(Seq(pandaPlace)) 54 | df.printSchema() 55 | } 56 | //end::createGetSchema[] 57 | 58 | //tag::createFromLocal[] 59 | def createFromLocal(input: Seq[PandaPlace]) = { 60 | session.createDataFrame(input) 61 | } 62 | //end::createFromLocal[] 63 | 64 | 
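  // Note: collect() materializes the entire DataFrame on the driver, so it is
  // only appropriate for small results.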
//tag::collectResults[] 65 | def collectDF(df: DataFrame) = { 66 | val result: Array[Row] = df.collect() 67 | result 68 | } 69 | //end::collectResults[] 70 | 71 | //tag::toRDD[] 72 | def toRDD(input: DataFrame): RDD[RawPanda] = { 73 | val rdd: RDD[Row] = input.rdd 74 | rdd.map(row => RawPanda(row.getAs[Long](0), row.getAs[String](1), 75 | row.getAs[String](2), row.getAs[Boolean](3), row.getAs[Array[Double]](4))) 76 | } 77 | //end::toRDD[] 78 | 79 | //tag::partitionedOutput[] 80 | def writeOutByZip(input: DataFrame): Unit = { 81 | input.write.partitionBy("zipcode").format("json").save("output/") 82 | } 83 | //end::partitionedOutput[] 84 | 85 | //tag::saveAppend[] 86 | def writeAppend(input: DataFrame): Unit = { 87 | input.write.mode(SaveMode.Append).save("output/") 88 | } 89 | //end::saveAppend[] 90 | 91 | def upsertPandas(input: DataFrame): Unit = { 92 | //tag::upsert[] 93 | input.mergeInto("pandaInfo", $"source.id" === $"target.id") 94 | .whenMatched() // Note you can override the general match condition above if desired 95 | .updateAll() 96 | .whenNotMatched() 97 | .insertAll() 98 | //end::upsert[] 99 | } 100 | 101 | def createJDBC() = { 102 | session.read.jdbc("jdbc:dialect:serverName;user=user;password=pass", 103 | "table", new Properties) 104 | 105 | //tag::createJDBC[] 106 | session.read.format("jdbc") 107 | .option("url", "jdbc:dialect:serverName") 108 | .option("dbtable", "table").load() 109 | //end::createJDBC[] 110 | } 111 | 112 | def writeJDBC(df: DataFrame) = { 113 | df.write.jdbc("jdbc:dialect:serverName;user=user;password=pass", 114 | "table", new Properties) 115 | 116 | //tag::writeJDBC[] 117 | df.write.format("jdbc") 118 | .option("url", "jdbc:dialect:serverName") 119 | .option("user", "user") 120 | .option("password", "pass") 121 | .option("dbtable", "table").save() 122 | //end::writeJDBC[] 123 | } 124 | 125 | //tag::loadParquet[] 126 | def loadParquet(path: String): DataFrame = { 127 | // Configure Spark to read binary data as string, 128 | // note: must be configured on session. 
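    // (binaryAsString tells Spark SQL to interpret Parquet BINARY columns written
    // by other tools as strings rather than as raw bytes.)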
129 | session.conf.set("spark.sql.parquet.binaryAsString", "true") 130 | 131 | // Load parquet data using merge schema (configured through option) 132 | session.read 133 | .option("mergeSchema", "true") 134 | .format("parquet") 135 | .load(path) 136 | } 137 | //end::loadParquet[] 138 | 139 | //tag::writeParquet[] 140 | def writeParquet(df: DataFrame, path: String) = { 141 | df.write.format("parquet").save(path) 142 | } 143 | //end::writeParquet[] 144 | 145 | //tag::loadHiveTable[] 146 | def loadHiveTable(): DataFrame = { 147 | session.read.table("pandas") 148 | } 149 | //end::loadHiveTable[] 150 | 151 | //tag::saveManagedTable[] 152 | def saveManagedTable(df: DataFrame): Unit = { 153 | df.write.saveAsTable("pandas") 154 | } 155 | //end::saveManagedTable[] 156 | } 157 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.mllib 2 | 3 | import scala.collection.Map 4 | 5 | import org.apache.spark._ 6 | import org.apache.spark.mllib.classification.LogisticRegressionModel 7 | import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS 8 | import org.apache.spark.mllib.feature._ 9 | import org.apache.spark.mllib.linalg.Vectors 10 | import org.apache.spark.mllib.linalg.{Vector => SparkVector} 11 | import org.apache.spark.mllib.regression.LabeledPoint 12 | import org.apache.spark.rdd.RDD 13 | 14 | import com.highperformancespark.examples.dataframe._ 15 | //end::imports[] 16 | 17 | object GoldilocksMLlib { 18 | 19 | def booleanToDouble(boolean: Boolean): Double = { 20 | if (boolean) 1.0 else 0.0 21 | } 22 | 23 | def toLabeledPointDense(rdd: RDD[RawPanda]): RDD[LabeledPoint] = { 24 | //tag::toLabeledPointDense[] 25 | rdd.map(rp => 26 | LabeledPoint(booleanToDouble(rp.happy), 27 | Vectors.dense(rp.attributes))) 28 | //end::toLabeledPointDense[] 29 | } 30 | 31 | //tag::toSparkVectorDense[] 32 | def toSparkVectorDense(input: Array[Double]) = { 33 | Vectors.dense(input) 34 | } 35 | //end::toSparkVectorDense[] 36 | 37 | //tag::selectTopTen[] 38 | def selectTopTenFeatures(rdd: RDD[LabeledPoint]): 39 | (ChiSqSelectorModel, Array[Int], RDD[SparkVector]) = { 40 | val selector = new ChiSqSelector(10) 41 | val model = selector.fit(rdd) 42 | val topFeatures = model.selectedFeatures 43 | val vecs = rdd.map(_.features) 44 | (model, topFeatures, model.transform(vecs)) 45 | } 46 | //end::selectTopTen[] 47 | 48 | //tag::keepLabeled[] 49 | def selectAndKeepLabeled(rdd: RDD[LabeledPoint]): RDD[LabeledPoint] = { 50 | val selector = new ChiSqSelector(10) 51 | val model = selector.fit(rdd) 52 | rdd.map{ 53 | case LabeledPoint(label, features) => 54 | LabeledPoint(label, model.transform(features)) 55 | } 56 | } 57 | //end::keepLabeled[] 58 | 59 | //tag::createLabelLookup[] 60 | def createLabelLookup[T](rdd: RDD[T]): Map[T, Double] = { 61 | val distinctLabels: Array[T] = rdd.distinct().collect() 62 | distinctLabels.zipWithIndex 63 | .map{case (label, x) => (label, x.toDouble)}.toMap 64 | } 65 | //end::createLabelLookup[] 66 | 67 | 68 | //tag::hashingTFSimple[] 69 | def hashingTf(rdd: RDD[String]): RDD[SparkVector] = { 70 | val ht = new HashingTF() 71 | val tokenized = rdd.map(_.split(" ").toIterable) 72 | ht.transform(tokenized) 73 | } 74 | //end::hashingTFSimple[] 75 | 76 | //tag::word2vecTrain[] 77 | def word2vecTrain(rdd: RDD[String]): Word2VecModel = { 78 | // Tokenize 
our data 79 | val tokenized = rdd.map(_.split(" ").toIterable) 80 | // Construct our word2vec model 81 | val wv = new Word2Vec() 82 | wv.fit(tokenized) 83 | } 84 | //end::word2vecTrain[] 85 | 86 | 87 | //tag::trainScaler[] 88 | // Trains a feature scaler and returns the scaler and scaled features 89 | def trainScaler(rdd: RDD[SparkVector]): (StandardScalerModel, RDD[SparkVector]) = { 90 | val scaler = new StandardScaler() 91 | val scalerModel = scaler.fit(rdd) 92 | (scalerModel, scalerModel.transform(rdd)) 93 | } 94 | //end::trainScaler[] 95 | 96 | //tag::hashingTFPreserve[] 97 | def toVectorPerserving(rdd: RDD[RawPanda]): RDD[(RawPanda, SparkVector)] = { 98 | val ht = new HashingTF() 99 | rdd.map{panda => 100 | val textField = panda.pt 101 | val tokenizedTextField = textField.split(" ").toIterable 102 | (panda, ht.transform(tokenizedTextField)) 103 | } 104 | } 105 | //end::hashingTFPreserve[] 106 | 107 | //tag::hashingTFPreserveZip[] 108 | def hashingTFPreserveZip(rdd: RDD[RawPanda]): RDD[(RawPanda, SparkVector)] = { 109 | val ht = new HashingTF() 110 | val tokenized = rdd.map{panda => panda.pt.split(" ").toIterable} 111 | val vecs = ht.transform(tokenized) 112 | rdd.zip(vecs) 113 | } 114 | //end::hashingTFPreserveZip[] 115 | 116 | //tag::toLabeledPointWithHashing[] 117 | def toLabeledPointWithHashing(rdd: RDD[RawPanda]): RDD[LabeledPoint] = { 118 | val ht = new HashingTF() 119 | rdd.map{rp => 120 | val hashingVec = ht.transform(rp.pt) 121 | val combined = hashingVec.toArray ++ rp.attributes 122 | LabeledPoint(booleanToDouble(rp.happy), 123 | Vectors.dense(combined)) 124 | } 125 | } 126 | //end::toLabeledPointWithHashing[] 127 | 128 | //tag::train[] 129 | def trainModel(rdd: RDD[LabeledPoint]): LogisticRegressionModel = { 130 | val lr = new LogisticRegressionWithLBFGS() 131 | val lrModel = lr.run(rdd) 132 | lrModel 133 | } 134 | //end::train[] 135 | 136 | //tag::trainWithIntercept[] 137 | def trainModelWithInterept(rdd: RDD[LabeledPoint]): LogisticRegressionModel = { 138 | val lr = new LogisticRegressionWithLBFGS() 139 | lr.setIntercept(true) 140 | val lrModel = lr.run(rdd) 141 | lrModel 142 | } 143 | //end::trainWithIntercept[] 144 | 145 | //tag::predict[] 146 | def predict(model: LogisticRegressionModel, rdd: RDD[SparkVector]): RDD[Double] = { 147 | model.predict(rdd) 148 | } 149 | //end::predict[] 150 | 151 | //tag::save[] 152 | def save(sc: SparkContext, path: String, model: LogisticRegressionModel) = { 153 | //tag::savePMML[] 154 | // Save to PMML - remote path 155 | model.toPMML(sc, path + "/pmml") 156 | // Save to PMML local path 157 | model.toPMML(path + "/pmml") 158 | //end::savePMML[] 159 | //tag::saveInternal[] 160 | // Save to internal - remote path 161 | model.save(sc, path + "/internal") 162 | //end::saveInternal[] 163 | } 164 | //end::save[] 165 | 166 | //tag::load[] 167 | def load(sc: SparkContext, path: String): LogisticRegressionModel = { 168 | LogisticRegressionModel.load(sc, path + "/internal") 169 | } 170 | //end::load[] 171 | } 172 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ml 2 | 3 | import scala.collection.Map 4 | 5 | import org.apache.spark._ 6 | import org.apache.spark.ml._ 7 | import org.apache.spark.ml.classification._ 8 | import org.apache.spark.ml.linalg._ 9 | import org.apache.spark.ml.param._ 10 | import 
org.apache.spark.ml.util.Identifiable 11 | import org.apache.spark.rdd.RDD 12 | import org.apache.spark.sql._ 13 | import org.apache.spark.sql._ 14 | import org.apache.spark.sql.functions._ 15 | import org.apache.spark.sql.types._ 16 | 17 | import com.highperformancespark.examples.dataframe._ 18 | //end::extraImports[] 19 | 20 | //tag::basicPipelineSetup[] 21 | class HardCodedWordCountStage(override val uid: String) extends Transformer { 22 | def this() = this(Identifiable.randomUID("hardcodedwordcount")) 23 | 24 | def copy(extra: ParamMap): HardCodedWordCountStage = { 25 | defaultCopy(extra) 26 | } 27 | //end::basicPipelineSetup[] 28 | 29 | //tag::basicTransformSchema[] 30 | override def transformSchema(schema: StructType): StructType = { 31 | // Check that the input type is a string 32 | val idx = schema.fieldIndex("happy_pandas") 33 | val field = schema.fields(idx) 34 | if (field.dataType != StringType) { 35 | throw new Exception( 36 | s"Input type ${field.dataType} did not match input type StringType") 37 | } 38 | // Add the return field 39 | schema.add(StructField("happy_panda_counts", IntegerType, false)) 40 | } 41 | //end::basicTransformSchema[] 42 | 43 | //tag::transformFunction[] 44 | def transform(df: Dataset[_]): DataFrame = { 45 | val wordcount = udf { in: String => in.split(" ").size } 46 | df.select(col("*"), 47 | wordcount(df.col("happy_pandas")).as("happy_panda_counts")) 48 | } 49 | //end::transformFunction[] 50 | } 51 | 52 | 53 | //tag::paramTransformer[] 54 | class ConfigurableWordCount(override val uid: String) extends Transformer { 55 | final val inputCol= new Param[String](this, "inputCol", "The input column") 56 | final val outputCol = new Param[String](this, "outputCol", "The output column") 57 | 58 | def setInputCol(value: String): this.type = set(inputCol, value) 59 | 60 | def setOutputCol(value: String): this.type = set(outputCol, value) 61 | 62 | def this() = this(Identifiable.randomUID("configurablewordcount")) 63 | 64 | def copy(extra: ParamMap): HardCodedWordCountStage = { 65 | defaultCopy(extra) 66 | } 67 | 68 | override def transformSchema(schema: StructType): StructType = { 69 | // Check that the input type is a string 70 | val idx = schema.fieldIndex($(inputCol)) 71 | val field = schema.fields(idx) 72 | if (field.dataType != StringType) { 73 | throw new Exception( 74 | s"Input type ${field.dataType} did not match input type StringType") 75 | } 76 | // Add the return field 77 | schema.add(StructField($(outputCol), IntegerType, false)) 78 | } 79 | 80 | def transform(df: Dataset[_]): DataFrame = { 81 | val wordcount = udf { in: String => in.split(" ").size } 82 | df.select(col("*"), wordcount(df.col($(inputCol))).as($(outputCol))) 83 | } 84 | } 85 | //end::paramTransformer[] 86 | 87 | 88 | //tag::simpleIndexer[] 89 | trait SimpleIndexerParams extends Params { 90 | final val inputCol= new Param[String](this, "inputCol", "The input column") 91 | final val outputCol = new Param[String](this, "outputCol", "The output column") 92 | } 93 | 94 | class SimpleIndexer(override val uid: String) 95 | extends Estimator[SimpleIndexerModel] with SimpleIndexerParams { 96 | 97 | def setInputCol(value: String) = set(inputCol, value) 98 | 99 | def setOutputCol(value: String) = set(outputCol, value) 100 | 101 | def this() = this(Identifiable.randomUID("simpleindexer")) 102 | 103 | override def copy(extra: ParamMap): SimpleIndexer = { 104 | defaultCopy(extra) 105 | } 106 | 107 | override def transformSchema(schema: StructType): StructType = { 108 | // Check that the input type is 
a string 109 | val idx = schema.fieldIndex($(inputCol)) 110 | val field = schema.fields(idx) 111 | if (field.dataType != StringType) { 112 | throw new Exception( 113 | s"Input type ${field.dataType} did not match input type StringType") 114 | } 115 | // Add the return field 116 | schema.add(StructField($(outputCol), IntegerType, false)) 117 | } 118 | 119 | override def fit(dataset: Dataset[_]): SimpleIndexerModel = { 120 | import dataset.sparkSession.implicits._ 121 | val words = dataset.select(dataset($(inputCol)).as[String]).distinct 122 | .collect() 123 | // Construct the model 124 | val model = new SimpleIndexerModel(uid, words) 125 | // Copy the parameters to the model 126 | copyValues(model) 127 | } 128 | } 129 | 130 | class SimpleIndexerModel(override val uid: String, words: Array[String]) 131 | extends Model[SimpleIndexerModel] with SimpleIndexerParams { 132 | 133 | override def copy(extra: ParamMap): SimpleIndexerModel = { 134 | defaultCopy(extra) 135 | } 136 | 137 | private val labelToIndex: Map[String, Double] = words.zipWithIndex. 138 | map{case (x, y) => (x, y.toDouble)}.toMap 139 | 140 | override def transformSchema(schema: StructType): StructType = { 141 | // Check that the input type is a string 142 | val idx = schema.fieldIndex($(inputCol)) 143 | val field = schema.fields(idx) 144 | if (field.dataType != StringType) { 145 | throw new Exception( 146 | s"Input type ${field.dataType} did not match input type StringType") 147 | } 148 | // Add the return field 149 | schema.add(StructField($(outputCol), IntegerType, false)) 150 | } 151 | 152 | override def transform(dataset: Dataset[_]): DataFrame = { 153 | val indexer = udf { label: String => labelToIndex(label) } 154 | dataset.select(col("*"), 155 | indexer(dataset($(inputCol)).cast(StringType)).as($(outputCol))) 156 | } 157 | } 158 | //end::SimpleIndexer[] 159 | --------------------------------------------------------------------------------
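A minimal sketch of how the SimpleIndexer estimator above could be used, assuming a
DataFrame `df` with a String column named "zip" (the column name, output column name,
and variable names here are illustrative, not taken from the repository):

    val indexer = new SimpleIndexer()
      .setInputCol("zip")
      .setOutputCol("zipIndex")
    // fit() collects the distinct values of the input column on the driver
    // and returns a SimpleIndexerModel holding the label-to-index mapping.
    val model = indexer.fit(df)
    // transform() appends the numeric index column to the existing columns.
    val indexed = model.transform(df)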