├── c ├── python ├── examples │ ├── bad_pyspark.py.fail │ ├── __init__.py │ ├── spark_expectations_example.py.fail │ ├── test_load_previous_run_data.py │ ├── dual_write.py │ ├── test_dual_write.py │ ├── load_previous_run_data.py │ ├── test_dual_write_new.py │ ├── pandera_ex.py │ ├── SQLLineage.py │ ├── udf.py │ ├── spark_expectations_example.py │ ├── bad_pyspark.py │ └── simple_perf.py ├── .flake8 ├── README.md ├── pyproject.toml ├── requirements.txt ├── setup.cfg └── tox.ini ├── project ├── build.properties └── plugins.sbt ├── sql ├── iceberg-schema-evolution-gotcha-possibility.sql.expected_to_fail ├── partioned_table_join.sql.conf ├── nonpartitioned_table_join.sql.conf ├── nonpartitioned_table_join.sql ├── gluten_only_nonpartitioned_table_join.sql ├── partioned_table_join.sql ├── iceberg-schema-evolution-gotcha-possibility.sql ├── iceberg-schema-evolution-gotcha-workaround.sql └── wap.sql ├── .jvmopts ├── native └── src │ ├── c │ ├── sum.h │ ├── sum.c │ ├── sumf_wrapper.c │ ├── sum_wrapper.c │ ├── include │ │ └── com_highperformancespark_examples_ffi_SumJNI.h │ └── gluten │ │ └── GlutenUDF.cpp │ ├── fortran │ └── sumf.f95 │ └── CMakeLists.txt ├── resources ├── rawpanda.json └── mysql-connector-java-5.1.38.jar ├── data └── project.csv ├── core └── src │ ├── main │ ├── julia │ │ ├── setup.jl │ │ └── wc.jl │ ├── perl │ │ ├── Changes │ │ ├── MANIFEST │ │ ├── ignore.txt │ │ ├── t │ │ │ ├── 00-load.t │ │ │ ├── manifest.t │ │ │ ├── pod.t │ │ │ └── pod-coverage.t │ │ ├── ghinfo.pl │ │ ├── Makefile.PL │ │ ├── xt │ │ │ └── boilerplate.t │ │ ├── README │ │ └── lib │ │ │ └── HighPerformanceSpark │ │ │ └── Examples.pm │ ├── java │ │ └── com │ │ │ └── highperformancespark │ │ │ └── examples │ │ │ ├── ffi │ │ │ └── SumJNIJava.java │ │ │ ├── objects │ │ │ ├── JavaCoffeeShop.java │ │ │ ├── JavaPandaPlace.java │ │ │ ├── JavaPandas.java │ │ │ ├── JavaPandaInfo.java │ │ │ └── JavaRawPanda.java │ │ │ ├── WordCount.java │ │ │ ├── JavaInterop.java │ │ │ └── dataframe │ │ │ ├── JavaUDFs.java │ │ │ └── JavaLoadSave.java │ ├── scala │ │ └── com │ │ │ └── high-performance-spark-examples │ │ │ ├── native │ │ │ ├── SumJNA.scala │ │ │ ├── NativeExample.scala │ │ │ ├── SumJNI.scala │ │ │ ├── StandAlone.scala │ │ │ ├── SumFJNA.scala │ │ │ └── PipeExample.scala │ │ │ ├── ml │ │ │ ├── SimpleExport.scala │ │ │ └── CustomPipeline.scala │ │ │ ├── tokenize │ │ │ └── SampleTokenize.scala │ │ │ ├── dataframe │ │ │ ├── SQLExtension.scala │ │ │ ├── RegularSQL.scala │ │ │ ├── NullabilityFilterOptimizer.scala │ │ │ ├── RawPandas.scala │ │ │ ├── UDFs.scala │ │ │ ├── MixedDataset.scala_back │ │ │ ├── MixedDataset.scala │ │ │ └── LoadSave.scala │ │ │ ├── streaming │ │ │ ├── Structured.scala │ │ │ └── DStream.scala │ │ │ ├── transformations │ │ │ ├── NarrowAndWide.scala │ │ │ └── Accumulators.scala │ │ │ ├── tools │ │ │ ├── ResourceProfileEx.scala │ │ │ ├── SampleData.scala │ │ │ ├── FilterInvalidPandas.scala │ │ │ └── GenerateScalingData.scala │ │ │ ├── wordcount │ │ │ └── WordCount.scala │ │ │ ├── errors │ │ │ └── throws.scala │ │ │ ├── perf │ │ │ └── SimplePerfTest.scala │ │ │ ├── goldilocks │ │ │ └── RDDJoinExamples.scala │ │ │ └── mllib │ │ │ └── GoldilocksMLlib.scala │ └── r │ │ ├── dapply.R │ │ └── wc.R │ └── test │ ├── scala │ └── com │ │ ├── highperformancespark │ │ └── examples │ │ │ └── JavaInteropHelper.scala │ │ └── high-performance-spark-examples │ │ ├── native │ │ ├── PipeExampleSuite.scala │ │ └── NativeExample.scala │ │ ├── errors │ │ └── ThrowsSuite.scala │ │ ├── goldilocks │ │ ├── JoinTest.scala │ │ ├── 
SortingTests.scala │ │ ├── EvaluationTests.scala │ │ └── GoldilocksLargeTests.scala │ │ ├── streaming │ │ └── DStreamSuite.scala │ │ ├── tokenize │ │ └── SampleTokenizeSuite.scala │ │ ├── ml │ │ ├── CustomPipeline.scala │ │ └── SimpleNaiveBayes.scala │ │ ├── mllib │ │ └── GoldilocksMLlibSuite.scala │ │ ├── tools │ │ ├── FilterInvalidPandasSuite.scala │ │ └── GenerateScalingDataSuite.scala │ │ ├── transformations │ │ └── Accumulators.scala │ │ ├── dataframe │ │ ├── PandaPlaceFilterPushdown.scala │ │ └── SQLExtensionTest.scala │ │ └── wordcount │ │ └── WordCountTest.scala │ └── java │ └── com │ └── highperformancespark │ └── examples │ └── JavaInteropTest.java ├── Dockerfile ├── accelerators ├── run_gluten.sh ├── install_rust_if_needed.sh ├── gluten_config.properties ├── comet_ex.sh ├── setup_gluten_deps.sh ├── setup_gluten_from_src.sh ├── gluten_spark_34_ex.sh ├── comet_env_setup.sh ├── setup_comet.sh ├── gluten_env_setup.sh └── setup_gluten_spark34.sh ├── migration └── sql.sh ├── shell-scripts └── launch-with-mysql-jdbc ├── conf ├── sbtconfig.txt └── log4j.properties ├── misc ├── container_launch.sh └── kernel.json ├── run_container.sh ├── se_simple.json ├── .scalafix.conf ├── appveyor.yml ├── target-validator ├── runme.sh └── ex.yaml ├── high_performance_pyspark └── __init__.py ├── se_complex.json ├── README.md ├── .gitignore ├── run_sql_examples.sh ├── env_setup.sh ├── run_pyspark_examples.sh └── Dockerfile-mini /c: -------------------------------------------------------------------------------- 1 | bloop 2 | 3 | -------------------------------------------------------------------------------- /python/examples/bad_pyspark.py.fail: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | sbt.version=1.9.9 2 | -------------------------------------------------------------------------------- /python/.flake8: -------------------------------------------------------------------------------- 1 | [flake8] 2 | max-line-length = 120 -------------------------------------------------------------------------------- /python/examples/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = 0.2 2 | -------------------------------------------------------------------------------- /python/examples/spark_expectations_example.py.fail: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /python/README.md: -------------------------------------------------------------------------------- 1 | Python examples for High Performance Spark 2 | -------------------------------------------------------------------------------- /sql/iceberg-schema-evolution-gotcha-possibility.sql.expected_to_fail: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /.jvmopts: -------------------------------------------------------------------------------- 1 | -Xms4096M 2 | -Xmx8096M 3 | -Xss2M 4 | -XX:MaxMetaspaceSize=4024M -------------------------------------------------------------------------------- /native/src/c/sum.h: -------------------------------------------------------------------------------- 1 | #ifndef _SUM_H 2 | #define _SUM_H 3 | 4 
| int sum(int input[], int num_elem); 5 | 6 | #endif /* _SUM_H */ 7 | -------------------------------------------------------------------------------- /resources/rawpanda.json: -------------------------------------------------------------------------------- 1 | {"name":"mission","pandas":[{"id":1,"zip":"94110","pt":"giant", "happy":true, 2 | "attributes":[0.4,0.5]}]} 3 | -------------------------------------------------------------------------------- /data/project.csv: -------------------------------------------------------------------------------- 1 | creator,projectname,stars 2 | holdenk,spark-upgrade,17 3 | krisnova,rust-nova,71 4 | kbendick,MongoMart,6 5 | mateiz,spark,36600 -------------------------------------------------------------------------------- /native/src/fortran/sumf.f95: -------------------------------------------------------------------------------- 1 | INTEGER FUNCTION SUMF(N,A) BIND(C, NAME='sumf') 2 | INTEGER A(N) 3 | SUMF=SUM(A) 4 | END 5 | -------------------------------------------------------------------------------- /core/src/main/julia/setup.jl: -------------------------------------------------------------------------------- 1 | Pkg.clone("https://github.com/dfdx/Spark.jl") 2 | Pkg.build("Spark") 3 | # we also need latest master of JavaCall.jl 4 | Pkg.checkout("JavaCall") -------------------------------------------------------------------------------- /resources/mysql-connector-java-5.1.38.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/high-performance-spark/high-performance-spark-examples/HEAD/resources/mysql-connector-java-5.1.38.jar -------------------------------------------------------------------------------- /core/src/main/perl/Changes: -------------------------------------------------------------------------------- 1 | Revision history for HighPerformanceSpark-Examples 2 | 3 | 0.01 Date/time 4 | First version, released on an unsuspecting world. 
5 | 6 | -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | ARG base 2 | FROM $base 3 | 4 | USER root 5 | RUN pip install --no-cache-dir pyarrow pyiceberg[pandas,snappy,daft,s3fs] avro fastavro 6 | USER dev 7 | RUN sbt clean compile 8 | -------------------------------------------------------------------------------- /native/src/c/sum.c: -------------------------------------------------------------------------------- 1 | #include "sum.h" 2 | 3 | int sum(int input[], int num_elem) { 4 | int c, ret = 0; 5 | for (c = 0; c < num_elem; c++) { 6 | ret += input[c]; 7 | } 8 | return ret; 9 | } 10 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools >= 58.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [[tool.mypy.overrides]] 6 | module = "examples" 7 | ignore_missing_imports = true 8 | -------------------------------------------------------------------------------- /core/src/main/perl/MANIFEST: -------------------------------------------------------------------------------- 1 | Changes 2 | lib/HighPerformanceSpark/Examples.pm 3 | Makefile.PL 4 | MANIFEST This list of files 5 | README 6 | t/00-load.t 7 | t/manifest.t 8 | t/pod-coverage.t 9 | t/pod.t 10 | -------------------------------------------------------------------------------- /python/requirements.txt: -------------------------------------------------------------------------------- 1 | spark-testing-base 2 | pandas 3 | pyarrow 4 | pyspark==3.5.0 5 | pyspark-asyncactions 6 | pandera 7 | pandera[pyspark] 8 | spark-expectations>=1.0 9 | venv-pack 10 | requests 11 | numpy<2.0 12 | -------------------------------------------------------------------------------- /accelerators/run_gluten.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | "${SPARK_HOME}/bin/spark-shell" --master local --jars "${ACCEL_JARS}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_2.12-1.1.1.jar" --spark-properties=gluten_config.properties 4 | -------------------------------------------------------------------------------- /native/src/c/sumf_wrapper.c: -------------------------------------------------------------------------------- 1 | // Fortran routine 2 | extern int sumf(int *, int[]); 3 | 4 | // Call the fortran code which expects by reference size 5 | int wrap_sum(int input[], int size) { 6 | return sumf(&size, input); 7 | } 8 | -------------------------------------------------------------------------------- /core/src/main/julia/wc.jl: -------------------------------------------------------------------------------- 1 | using Spark 2 | sc = SparkContext(master="local") 3 | path = string("file:///", ENV["SPARK_HOME"], "/README.md") 4 | txt = text_file(sc, path) 5 | # Normally we would use a flatmap, but currently only has map_partitions 6 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/ffi/SumJNIJava.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ffi; 2 | 3 | // tag::sumJNIJava[] 4 | class SumJNIJava { 5 | public static native Integer sum(Integer[] array); 6 | } 7 | // end::sumJNIJava[] 8 | 
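// Hedged note (not part of the original file): before a native method like the one
// above can be invoked, the shared native library must be loaded, e.g. via a static
// initializer:
//
//   static { System.loadLibrary("high-performance-spark0"); }
//   Integer total = SumJNIJava.sum(new Integer[]{1, 2, 3});
//
// The library name is an assumption borrowed from the Scala SumJNA/SumJNI examples in
// this repo (StandAlone.scala uses "highPerformanceSpark0"). Also note the boxed
// Integer[] signature: the C wrapper in native/src/c/sum_wrapper.c binds to the Scala
// SumJNI class and operates on a primitive jintArray, so this Java class appears to be
// illustrative rather than backed by a matching native implementation in this tree.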
-------------------------------------------------------------------------------- /migration/sql.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | pip install sqlfluff 4 | python -m pip install 'sqlfluff-plugin-sparksql-upgrade @ git+https://github.com/holdenk/spark-upgrade#subdirectory=sql' 5 | 6 | sqlfluff rules |grep -i spark 7 | sqlfluff fix --dialect sparksql farts.sql 8 | -------------------------------------------------------------------------------- /shell-scripts/launch-with-mysql-jdbc: -------------------------------------------------------------------------------- 1 | ASSEMBLY_JAR=./target/scala-2.10/examples_2.10.jar 2 | CLASS="com.highperformancespark.dataframe.mysqlload" 3 | #tag:[submit] 4 | spark-submit --jars ./resources/mysql-connector-java-5.1.38.jar $ASSEMBLY_JAR $CLASS 5 | #end:[submit] -------------------------------------------------------------------------------- /accelerators/install_rust_if_needed.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ -f "$HOME/.cargo/env" ]; then 3 | source "$HOME/.cargo/env" 4 | fi 5 | 6 | if ! command -v cargo; then 7 | curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y 8 | source "$HOME/.cargo/env" 9 | fi 10 | -------------------------------------------------------------------------------- /accelerators/gluten_config.properties: -------------------------------------------------------------------------------- 1 | spark.plugins=io.glutenproject.GlutenPlugin 2 | spark.memory.offHeap.enabled=true 3 | spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager 4 | # This static allocation is one of the hardest part of using Gluten 5 | spark.memory.offHeap.size=20g 6 | -------------------------------------------------------------------------------- /conf/sbtconfig.txt: -------------------------------------------------------------------------------- 1 | 2 | # Set the java args to high 3 | 4 | -Xmx2048M 5 | 6 | -XX:MaxPermSize=2048m 7 | 8 | -XX:ReservedCodeCacheSize=128m 9 | 10 | -XX:+CMSClassUnloadingEnabled 11 | 12 | # Set the extra SBT options 13 | 14 | -Dsbt.log.format=true 15 | 16 | # JNA 17 | 18 | -Djna.nosys=true 19 | -------------------------------------------------------------------------------- /misc/container_launch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | if [ ! 
-f /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb ]; then 3 | cp /high-performance-spark-examples/iceberg-workshop-solutions/Workshop-Template.ipynb /high-performance-spark-examples/iceberg-workshop/Workshop.ipynb 4 | fi 5 | jupyter-lab --ip 0.0.0.0 --port 8877 6 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/native/SumJNA.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ffi 2 | 3 | // tag::sumJNA[] 4 | import com.sun.jna._ 5 | object SumJNA { 6 | Native.register("high-performance-spark0") 7 | @native def sum(n: Array[Int], size: Int): Int 8 | } 9 | // end::sumJNA[] 10 | -------------------------------------------------------------------------------- /core/src/main/perl/ignore.txt: -------------------------------------------------------------------------------- 1 | Makefile 2 | Makefile.old 3 | Build 4 | Build.bat 5 | META.* 6 | MYMETA.* 7 | .build/ 8 | _build/ 9 | cover_db/ 10 | blib/ 11 | inc/ 12 | .lwpcookies 13 | .last_cover_stats 14 | nytprof.out 15 | pod2htm*.tmp 16 | pm_to_blib 17 | HighPerformanceSpark-Examples-* 18 | HighPerformanceSpark-Examples-*.tar.gz 19 | -------------------------------------------------------------------------------- /core/src/main/perl/t/00-load.t: -------------------------------------------------------------------------------- 1 | #!perl -T 2 | use 5.006; 3 | use strict; 4 | use warnings; 5 | use Test::More; 6 | 7 | plan tests => 1; 8 | 9 | BEGIN { 10 | use_ok( 'HighPerformanceSpark::Examples' ) || print "Bail out!\n"; 11 | } 12 | 13 | diag( "Testing HighPerformanceSpark::Examples $HighPerformanceSpark::Examples::VERSION, Perl $], $^X" ); 14 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/native/NativeExample.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ffi 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | object NativeExample { 6 | def jniSum(input: RDD[(String, Array[Int])]): RDD[(String, Int)] = { 7 | input.mapValues(values => new SumJNI().sum(values)) 8 | } 9 | } 10 | -------------------------------------------------------------------------------- /sql/partioned_table_join.sql.conf: -------------------------------------------------------------------------------- 1 | --conf spark.sql.sources.v2.bucketing.enabled=true 2 | --conf spark.sql.iceberg.planning.preserve-data-grouping=true 3 | --conf spark.sql.requireAllClusterKeysForCoPartition=false 4 | 5 | --conf spark.sql.adaptive.enabled=false 6 | --conf spark.sql.autoBroadcastJoinThreshold=-1 7 | --conf spark.sql.shuffle.partitions=4 8 | -------------------------------------------------------------------------------- /core/src/test/scala/com/highperformancespark/examples/JavaInteropHelper.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples 2 | 3 | 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | 7 | class JavaInteropTestHelper(sc: SparkContext) { 8 | def generateMiniPairRDD(): RDD[(String, Long)] = { 9 | sc.parallelize(List(("panda", 12L))) 10 | } 11 | } 12 | -------------------------------------------------------------------------------- /sql/nonpartitioned_table_join.sql.conf: 
-------------------------------------------------------------------------------- 1 | --conf spark.sql.sources.v2.bucketing.enabled=true 2 | --conf spark.sql.iceberg.planning.preserve-data-grouping=true 3 | --conf spark.sql.requireAllClusterKeysForCoPartition=false 4 | 5 | --conf spark.sql.adaptive.enabled=false 6 | --conf spark.sql.autoBroadcastJoinThreshold=-1 7 | --conf spark.sql.shuffle.partitions=4 8 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/native/SumJNI.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ffi 2 | 3 | import com.github.sbt.jni.nativeLoader 4 | 5 | //tag::sumJNIDecorator[] 6 | @nativeLoader("high-performance-spark0") 7 | //end::sumJNIDecorator[] 8 | // tag::sumJNI[] 9 | class SumJNI { 10 | @native def sum(n: Array[Int]): Int 11 | } 12 | // end::sumJNI[] 13 | -------------------------------------------------------------------------------- /accelerators/comet_ex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | # If you change this update the workflow version too. 5 | SPARK_MAJOR=${SPARK_MAJOR:-3.5} 6 | SPARK_VERSION=${SPARK_MAJOR}.1 7 | export SPARK_MAJOR 8 | export SPARK_VERSION 9 | 10 | source setup_comet.sh 11 | pushd .. 12 | source ./env_setup.sh 13 | popd 14 | source comet_env_setup.sh 15 | pushd .. 16 | USE_COMET="true" ./run_sql_examples.sh 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/ml/SimpleExport.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ml 2 | 3 | import org.apache.spark.ml.classification._ 4 | 5 | object SimpleExport { 6 | //tag::exportLR[] 7 | def exportLRToCSV(model: LogisticRegressionModel) = { 8 | (model.coefficients.toArray :+ model.intercept).mkString(",") 9 | } 10 | //end::exportLR[] 11 | } 12 | -------------------------------------------------------------------------------- /core/src/main/perl/t/manifest.t: -------------------------------------------------------------------------------- 1 | #!perl -T 2 | use 5.006; 3 | use strict; 4 | use warnings; 5 | use Test::More; 6 | 7 | unless ( $ENV{RELEASE_TESTING} ) { 8 | plan( skip_all => "Author tests not required for installation" ); 9 | } 10 | 11 | my $min_tcm = 0.9; 12 | eval "use Test::CheckManifest $min_tcm"; 13 | plan skip_all => "Test::CheckManifest $min_tcm required" if $@; 14 | 15 | ok_manifest(); 16 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/native/StandAlone.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ffi 2 | 3 | object StandAlone { 4 | // $COVERAGE-OFF$ 5 | def main(args: Array[String]) { 6 | //tag::systemLoadLibrary[] 7 | System.loadLibrary("highPerformanceSpark0") 8 | //end::systemLoadLibrary[] 9 | println(new SumJNI().sum(Array(1,2,3))) 10 | } 11 | // $COVERAGE-ON$ 12 | } 13 | -------------------------------------------------------------------------------- /core/src/main/perl/t/pod.t: -------------------------------------------------------------------------------- 1 | #!perl -T 2 | use 5.006; 3 | use strict; 4 | use warnings; 5 | use Test::More; 6 | 7 | unless ( 
$ENV{RELEASE_TESTING} ) { 8 | plan( skip_all => "Author tests not required for installation" ); 9 | } 10 | 11 | # Ensure a recent version of Test::Pod 12 | my $min_tp = 1.22; 13 | eval "use Test::Pod $min_tp"; 14 | plan skip_all => "Test::Pod $min_tp required for testing POD" if $@; 15 | 16 | all_pod_files_ok(); 17 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/native/SumFJNA.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ffi 2 | 3 | // tag::sumFJNA[] 4 | import com.sun.jna._ 5 | import com.sun.jna.ptr._ 6 | object SumFJNA { 7 | Native.register("high-performance-spark0") 8 | @native def sumf(n: IntByReference, a: Array[Int]): Int 9 | def easySum(size: Int, a: Array[Int]): Int = { 10 | val ns = new IntByReference(size) 11 | sumf(ns, a) 12 | } 13 | } 14 | // end::sumFJNA[] 15 | -------------------------------------------------------------------------------- /run_container.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | VERSION=${VERSION:-0.5} 4 | IMAGE=${IMAGE:-holdenk/hps:$VERSION} 5 | export VERSION 6 | export IMAGE 7 | docker image pull "$IMAGE" 8 | mkdir -p warehouse 9 | mkdir -p iceberg-workshop 10 | docker container run --mount type=bind,source="$(pwd)"/warehouse,target=/high-performance-spark-examples/warehouse --mount type=bind,source="$(pwd)/iceberg-workshop",target=/high-performance-spark-examples/iceberg-workshop -p 8877:8877 -p 4040:4040 -it "${IMAGE}" # /bin/bash 11 | -------------------------------------------------------------------------------- /python/examples/test_load_previous_run_data.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.session import SparkSession 2 | import os 3 | import tempfile 4 | 5 | from sparktestingbase.sqltestcase import SQLTestCase 6 | from .load_previous_run_data import LoadPreviousRunData 7 | 8 | 9 | class TestLoadPreviousRunData(SQLTestCase): 10 | def test_do_magic(self): 11 | lprd = LoadPreviousRunData(self.session) 12 | try: 13 | lprd.do_magic() 14 | except FileNotFoundError: 15 | print("No previous jobs") 16 | -------------------------------------------------------------------------------- /se_simple.json: -------------------------------------------------------------------------------- 1 | {"product_id": "pay", "table_name": "local.fake_table_name", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. 
Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": false, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} 2 | -------------------------------------------------------------------------------- /native/src/c/sum_wrapper.c: -------------------------------------------------------------------------------- 1 | #include "sum.h" 2 | #include "include/com_highperformancespark_examples_ffi_SumJNI.h" 3 | #include 4 | #include 5 | 6 | /* 7 | * Class: com_highperformancespark_examples_ffi_SumJNI 8 | * Method: sum 9 | * Signature: ([I)I 10 | */ 11 | JNIEXPORT jint JNICALL Java_com_highperformancespark_examples_ffi_SumJNI_sum 12 | (JNIEnv *env, jobject obj, jintArray ja) { 13 | jsize size = (*env)->GetArrayLength(env, ja); 14 | jint *a = (*env)->GetIntArrayElements(env, ja, 0); 15 | return sum(a, size); 16 | } 17 | -------------------------------------------------------------------------------- /sql/nonpartitioned_table_join.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS local.udevelopers ( 2 | username string, 3 | firstname string, 4 | lastname string) 5 | USING iceberg; 6 | CREATE TABLE IF NOT EXISTS local.uprojects ( 7 | creator string, 8 | uprojectname string) 9 | USING iceberg; 10 | INSERT INTO local.udevelopers VALUES("krisnova", "Kris", "Nova"); 11 | INSERT INTO local.uprojects VALUES("krisnova", "aurae"); 12 | SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username; 13 | -------------------------------------------------------------------------------- /core/src/main/perl/ghinfo.pl: -------------------------------------------------------------------------------- 1 | #!/usr/bin/perl 2 | use strict; 3 | use warnings; 4 | 5 | use Pithub; 6 | use Data::Dumper; 7 | 8 | # Find all of the commentors on an issue 9 | my $user = $ENV{'user'}; 10 | my $repo = $ENV{'repo'}; 11 | my $p = Pithub->new(user => $user, repo => $repo); 12 | while (my $id = <>) { 13 | chomp ($id); 14 | my $issue_comments = $p->issues->comments->list(issue_id => $id); 15 | print $id; 16 | while (my $comment = $issue_comments->next) { 17 | print " ".$comment->{"user"}->{"login"}; 18 | } 19 | print "\n"; 20 | } 21 | -------------------------------------------------------------------------------- /sql/gluten_only_nonpartitioned_table_join.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS local.udevelopers ( 2 | username string, 3 | firstname string, 4 | lastname string) 5 | USING iceberg; 6 | CREATE TABLE IF NOT EXISTS local.uprojects ( 7 | creator string, 8 | uprojectname string) 9 | USING iceberg; 10 | INSERT INTO local.udevelopers VALUES("krisnova", "Kris", "Nova"); 11 | INSERT INTO local.uprojects VALUES("krisnova", "aurae"); 12 | SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username; 13 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/tokenize/SampleTokenize.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.tokenize 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | object SampleTokenize { 6 | //tag::DIFFICULT[] 7 | def difficultTokenizeRDD(input: RDD[String]) = { 8 | input.flatMap(_.split(" ")) 9 | } 
10 | //end::DIFFICULT[] 11 | 12 | //tag::EASY[] 13 | def tokenizeRDD(input: RDD[String]) = { 14 | input.flatMap(tokenize) 15 | } 16 | 17 | protected[tokenize] def tokenize(input: String) = { 18 | input.split(" ") 19 | } 20 | //end::EASY[] 21 | } 22 | -------------------------------------------------------------------------------- /sql/partioned_table_join.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS local.developers ( 2 | username string, 3 | firstname string, 4 | lastname string) 5 | USING iceberg 6 | PARTITIONED BY (username); 7 | CREATE TABLE IF NOT EXISTS local.projects ( 8 | creator string, 9 | projectname string) 10 | USING iceberg 11 | PARTITIONED BY (creator); 12 | INSERT INTO local.developers VALUES("krisnova", "Kris", "Nova"); 13 | INSERT INTO local.projects VALUES("krisnova", "aurae"); 14 | SELECT * FROM local.developers INNER JOIN local.projects ON local.projects.creator = local.developers.username; 15 | -------------------------------------------------------------------------------- /accelerators/setup_gluten_deps.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | sudo apt-get update 5 | #tag::gluten_deps[] 6 | sudo apt-get install -y locales wget tar tzdata git ccache cmake ninja-build build-essential \ 7 | llvm-dev clang libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev \ 8 | libcurl4-openssl-dev maven rapidjson-dev libdouble-conversion-dev libgflags-dev \ 9 | libsodium-dev libsnappy-dev nasm 10 | sudo apt install -y libunwind-dev 11 | sudo apt-get install -y libgoogle-glog-dev 12 | sudo apt-get -y install docker-compose 13 | sudo apt-get install -y libre2-9 || sudo apt-get install -y libre2-10 14 | #end::gluten_deps[] 15 | -------------------------------------------------------------------------------- /.scalafix.conf: -------------------------------------------------------------------------------- 1 | UnionRewrite.deprecatedMethod { 2 | "unionAll" = "union" 3 | } 4 | 5 | OrganizeImports { 6 | blankLines = Auto, 7 | groups = [ 8 | "re:javax?\\." 9 | "scala." 10 | "org.apache.spark." 
11 | "*" 12 | ], 13 | removeUnused = false 14 | } 15 | 16 | rules = [ 17 | DisableSyntax, 18 | SparkAutoUpgrade, 19 | MigrateHiveContext, 20 | MigrateToSparkSessionBuilder, 21 | MigrateDeprecatedDataFrameReaderFuns, 22 | AccumulatorUpgrade, 23 | onFailureFix, 24 | ExecutorPluginWarn, 25 | UnionRewrite, 26 | GroupByKeyWarn, 27 | GroupByKeyRewrite, 28 | MetadataWarnQQ, 29 | ScalaTestExtendsFix, 30 | ScalaTestImportChange 31 | ] -------------------------------------------------------------------------------- /appveyor.yml: -------------------------------------------------------------------------------- 1 | version: '{build}' 2 | 3 | platform: 4 | - x86 5 | - x64 6 | 7 | environment: 8 | matrix: 9 | - JAVA_HOME: C:\Program Files\Java\jdk1.8.0 10 | 11 | 12 | install: 13 | - ps: Start-FileDownload 'http://repo.typesafe.com/typesafe/ivy-releases/org.scala-sbt/sbt-launch/0.13.9/sbt-launch.jar' 14 | - xcopy sbt-launch.jar sbt\ 15 | - del build.sbt 16 | - copy build_windows.sbt build.sbt 17 | 18 | build_script: 19 | - sbt\sbt clean compile 20 | 21 | test_script: 22 | - sbt\sbt "testOnly com.highperformancespark.examples.tools.FilterInvalidPandasSuite" 23 | 24 | cache: 25 | - C:\Users\appveyor\.ivy2 26 | - C:\Users\appveyor\.m2 27 | - C:\Users\appveyor\.sbt -------------------------------------------------------------------------------- /accelerators/setup_gluten_from_src.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | 4 | # Setup deps 5 | source setup_gluten_deps.sh 6 | 7 | # Try gluten w/clickhouse 8 | #if [ ! -d gluten ]; then 9 | # git clone https://github.com/oap-project/gluten.git 10 | # cd gluten 11 | # bash ./ep/build-clickhouse/src/build_clickhouse.sh 12 | #fi 13 | 14 | # Build gluten 15 | if [ ! -d gluten ]; then 16 | # We need Spark 3.5 w/scala212 17 | git clone git@github.com:holdenk/gluten.git || git clone https://github.com/holdenk/gluten.git 18 | cd gluten 19 | git checkout add-spark35-scala213-hack 20 | ./dev/builddeps-veloxbe.sh 21 | mvn clean package -Pbackends-velox -Pspark-3.5 -DskipTests 22 | cd .. 
23 | fi 24 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/objects/JavaCoffeeShop.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.objects; 2 | 3 | import java.io.Serializable; 4 | 5 | public class JavaCoffeeShop implements Serializable { 6 | private String zip; 7 | private String name; 8 | 9 | public JavaCoffeeShop(String zip, String name) { 10 | this.zip = zip; 11 | this.name = name; 12 | } 13 | 14 | public String getZip() { 15 | return zip; 16 | } 17 | 18 | public void setZip(String zip) { 19 | this.zip = zip; 20 | } 21 | 22 | public String getName() { 23 | return name; 24 | } 25 | 26 | public void setName(String name) { 27 | this.name = name; 28 | } 29 | } -------------------------------------------------------------------------------- /native/src/c/include/com_highperformancespark_examples_ffi_SumJNI.h: -------------------------------------------------------------------------------- 1 | /* DO NOT EDIT THIS FILE - it is machine generated */ 2 | #include 3 | /* Header for class com_highperformancespark_examples_ffi_SumJNI */ 4 | 5 | #ifndef _Included_com_highperformancespark_examples_ffi_SumJNI 6 | #define _Included_com_highperformancespark_examples_ffi_SumJNI 7 | #ifdef __cplusplus 8 | extern "C" { 9 | #endif 10 | /* 11 | * Class: com_highperformancespark_examples_ffi_SumJNI 12 | * Method: sum 13 | * Signature: ([I)I 14 | */ 15 | JNIEXPORT jint JNICALL Java_com_highperformancespark_examples_ffi_SumJNI_sum 16 | (JNIEnv *, jobject, jintArray); 17 | 18 | #ifdef __cplusplus 19 | } 20 | #endif 21 | #endif 22 | -------------------------------------------------------------------------------- /accelerators/gluten_spark_34_ex.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 6 | cd "${SCRIPT_DIR}" 7 | source "${SCRIPT_DIR}/setup_gluten_spark34.sh" 8 | 9 | export SPARK_HOME 10 | PATH="$(pwd)/${SPARK_DIR}/bin:$PATH" 11 | export PATH 12 | "${SPARK_HOME}/bin/spark-sql" --master local[5] \ 13 | --conf spark.plugins=io.glutenproject.GlutenPlugin \ 14 | --conf spark.memory.offHeap.enabled=true \ 15 | --conf spark.memory.offHeap.size=5g \ 16 | --jars "${GLUTEN_JAR}" \ 17 | --conf spark.eventLog.enabled=true \ 18 | -e "SELECT 1" 19 | 20 | source gluten_env_setup.sh 21 | cd .. 
22 | ./run_sql_examples.sh || echo "Expected to fail" 23 | -------------------------------------------------------------------------------- /sql/iceberg-schema-evolution-gotcha-possibility.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS local.udevelopers_sorted; 2 | CREATE TABLE IF NOT EXISTS local.udevelopers_sorted ( 3 | username string, 4 | firstname string, 5 | lastname string) 6 | USING ICEBERG; 7 | INSERT INTO local.udevelopers_sorted VALUES("krisnova", "Kris", "Nova"); 8 | ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY lastname; 9 | ALTER TABLE local.udevelopers_sorted RENAME COLUMN lastname TO deprecated_lastname; 10 | SELECT * FROM local.udevelopers_sorted; 11 | ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY username; 12 | ALTER TABLE local.udevelopers_sorted DROP COLUMN deprecated_lastname; 13 | SELECT * FROM local.udevelopers_sorted; 14 | 15 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/native/PipeExampleSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Test our simple JNI 3 | */ 4 | package com.highperformancespark.examples.ffi 5 | 6 | import com.holdenkarau.spark.testing._ 7 | import org.scalatest.funsuite.AnyFunSuite 8 | import org.scalatest.matchers.should.Matchers._ 9 | import org.scalatestplus.scalacheck.Checkers 10 | 11 | 12 | class PipeExampleSuite extends AnyFunSuite with SharedSparkContext with Checkers { 13 | ignore("commentors on a pr") { 14 | val rdd = sc.parallelize(List(12883)) 15 | val expected = (12883, List("SparkQA", "srowen")) 16 | val result = PipeExample.lookupUserPRS(sc, rdd) 17 | assert(expected === result.collect()(0)) 18 | } 19 | } 20 | -------------------------------------------------------------------------------- /target-validator/runme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC1091,SC2034 3 | 4 | source ../env_setup.sh 5 | set -ex 6 | export SPARK_VERSION="${SPARK_VERSION:-3.4.1}" 7 | 8 | # Disable for now until the target folks agree on the PR nested builds are slow. 9 | exit 0 10 | 11 | git clone git@github.com:holdenk/data-validator.git || git clone https://github.com/holdenk/data-validator.git 12 | cd data-validator 13 | git checkout upgrade-to-modern-spark 14 | sbt -Dspark="${SPARK_VERSION}" clean assembly 15 | JAR_PATH="$(pwd)/target/scala-2.12/data-validator-assembly-${SPARK_VERSION}_0.15.0.jar" 16 | export JAR_PATH 17 | cd .. 18 | "${SPARK_HOME}/bin/spark-submit" --master local "$JAR_PATH" --config ex.yaml || echo "Failed as expected." 
19 | -------------------------------------------------------------------------------- /misc/kernel.json: -------------------------------------------------------------------------------- 1 | { 2 | "argv": [ 3 | "java", 4 | "-cp", 5 | "/home/dev/.local/share/jupyter/kernels/scala2.13/launcher.jar:.:/high-performance-spark-examples/:/high-performance-spark-examples/target/scala-2.13/home/dev/.local/share/jupyter/kernels/scala/launcher.jar:/high-performance-spark-examples/spark-3.5.2-bin-hadoop3-scala2.13/jars/*", 6 | "coursier.bootstrap.launcher.Launcher", 7 | "--log", 8 | "info", 9 | "--metabrowse", 10 | "--id", 11 | "scala2.13", 12 | "--display-name", 13 | "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", 14 | "--connection-file", 15 | "{connection_file}" 16 | ], 17 | "display_name": "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)", 18 | "language": "scala" 19 | } 20 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/SQLExtension.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Extension for the SparkSession to allow us to plug in a custom optimizer 3 | */ 4 | 5 | package com.highperformancespark.examples.dataframe 6 | 7 | import org.apache.spark.sql.{SparkSessionExtensions, SparkSessionExtensionsProvider} 8 | 9 | class SQLExtension extends SparkSessionExtensionsProvider { 10 | override def apply(extensions: SparkSessionExtensions): Unit = { 11 | // There are _many different_ types of rules you can inject, here we're focused on 12 | // making things go fast so our sample is an optimizer rule (AQE rules could also make sense). 13 | extensions.injectOptimizerRule(session => NullabilityFilterOptimizer) 14 | } 15 | } 16 | -------------------------------------------------------------------------------- /core/src/main/perl/t/pod-coverage.t: -------------------------------------------------------------------------------- 1 | #!perl -T 2 | use 5.006; 3 | use strict; 4 | use warnings; 5 | use Test::More; 6 | 7 | unless ( $ENV{RELEASE_TESTING} ) { 8 | plan( skip_all => "Author tests not required for installation" ); 9 | } 10 | 11 | # Ensure a recent version of Test::Pod::Coverage 12 | my $min_tpc = 1.08; 13 | eval "use Test::Pod::Coverage $min_tpc"; 14 | plan skip_all => "Test::Pod::Coverage $min_tpc required for testing POD coverage" 15 | if $@; 16 | 17 | # Test::Pod::Coverage doesn't require a minimum Pod::Coverage version, 18 | # but older versions don't recognize some common documentation styles 19 | my $min_pc = 0.18; 20 | eval "use Pod::Coverage $min_pc"; 21 | plan skip_all => "Pod::Coverage $min_pc required for testing POD coverage" 22 | if $@; 23 | 24 | all_pod_coverage_ok(); 25 | -------------------------------------------------------------------------------- /sql/iceberg-schema-evolution-gotcha-workaround.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS local.udevelopers_sorted; 2 | CREATE TABLE IF NOT EXISTS local.udevelopers_sorted ( 3 | username string, 4 | firstname string, 5 | lastname string) 6 | USING ICEBERG; 7 | ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY lastname; 8 | INSERT INTO local.udevelopers_sorted VALUES("krisnova", "Kris", "Nova"); 9 | SELECT * FROM local.udevelopers_sorted; 10 | ALTER TABLE local.udevelopers_sorted WRITE ORDERED BY username; 11 | -- Hack, add it to identifier fields so we can do a "partial" drop where it stays in the schema and we don't 12 | -- 
corrupt the metadata. 13 | ALTER TABLE local.udevelopers_sorted ADD PARTITION FIELD lastname; 14 | ALTER TABLE local.udevelopers_sorted DROP PARTITION FIELD lastname; 15 | SELECT * FROM local.udevelopers_sorted; 16 | -------------------------------------------------------------------------------- /accelerators/comet_env_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | SPARK_EXTRA=" 4 | --jars ${COMET_JAR} \ 5 | --driver-class-path ${COMET_JAR} \ 6 | --conf spark.comet.enabled=true \ 7 | --conf spark.comet.exec.enabled=true \ 8 | --conf spark.comet.exec.all.enabled=true \ 9 | --conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ 10 | --conf spark.comet.exec.shuffle.enabled=true \ 11 | --conf spark.comet.columnar.shuffle.enabled=true" 12 | # Instead of --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions we set 13 | # EXTRA_EXTENSIONS so it can be appended to iceberg 14 | if [ -z "$EXTRA_EXTENSIONS" ]; then 15 | EXTRA_EXTENSIONS="org.apache.comet.CometSparkSessionExtensions" 16 | else 17 | EXTRA_EXTENSIONS="org.apache.comet.CometSparkSessionExtensions,$EXTRA_EXTENSIONS" 18 | fi 19 | export EXTRA_EXTENSIONS 20 | export SPARK_EXTRA 21 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/objects/JavaPandaPlace.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.objects; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | 6 | public class JavaPandaPlace implements Serializable { 7 | private String name; 8 | private List pandas; 9 | 10 | /** 11 | * @param name place name 12 | * @param pandas pandas in that place 13 | */ 14 | public JavaPandaPlace(String name, List pandas) { 15 | this.name = name; 16 | this.pandas = pandas; 17 | } 18 | 19 | public String getName() { 20 | return name; 21 | } 22 | 23 | public void setName(String name) { 24 | this.name = name; 25 | } 26 | 27 | public List getPandas() { 28 | return pandas; 29 | } 30 | 31 | public void setPandas(List pandas) { 32 | this.pandas = pandas; 33 | } 34 | } -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/RegularSQL.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Using plain-old-sql 3 | */ 4 | package com.highperformancespark.examples.dataframe 5 | 6 | import org.apache.spark.sql._ 7 | 8 | case class RegularSQL(sqlContext: SQLContext) { 9 | 10 | //tag::queryTable[] 11 | def querySQL(): DataFrame = { 12 | sqlContext.sql("SELECT * FROM pandas WHERE size > 0") 13 | } 14 | //end::queryTable[] 15 | 16 | // TODO: Holden: include a parquet example file and point this to that. 
17 | //tag::queryRawFile[] 18 | def queryRawFile(): DataFrame = { 19 | sqlContext.sql("SELECT * FROM parquet.`path_to_parquet_file`") 20 | } 21 | //end::queryRawFile[] 22 | 23 | //tag::registerTable[] 24 | def registerTable(df: DataFrame): Unit = { 25 | df.registerTempTable("pandas") 26 | df.write.saveAsTable("perm_pandas") 27 | } 28 | //end::registerTable[] 29 | } 30 | -------------------------------------------------------------------------------- /project/plugins.sbt: -------------------------------------------------------------------------------- 1 | addSbtPlugin("org.scalastyle" %% "scalastyle-sbt-plugin" % "1.0.0") 2 | 3 | resolvers += "sonatype-releases" at "https://oss.sonatype.org/content/repositories/releases/" 4 | 5 | resolvers += "sonatype-snapshots" at "https://oss.sonatype.org/content/repositories/snapshots/" 6 | 7 | 8 | addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.2") 9 | 10 | addDependencyTreePlugin 11 | 12 | //tag::scalaFix[] 13 | addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.12.1") 14 | //end::scalaFix[] 15 | 16 | //tag::sbtJNIPlugin[] 17 | addSbtPlugin("com.github.sbt" %% "sbt-jni" % "1.7.0") 18 | //end::sbtJNIPlugin[] 19 | 20 | //tag::xmlVersionConflict[] 21 | // See https://github.com/scala/bug/issues/12632 22 | ThisBuild / libraryDependencySchemes ++= Seq( 23 | "org.scala-lang.modules" %% "scala-xml" % VersionScheme.Always 24 | ) 25 | //end::xmlVersionConflict[] 26 | 27 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "2.2.0") 28 | -------------------------------------------------------------------------------- /python/examples/dual_write.py: -------------------------------------------------------------------------------- 1 | import asyncactions # noqa # pylint: disable=unused-import 2 | 3 | 4 | class DualWriteExample: 5 | def do_write(self, df, p1, p2): 6 | """ 7 | Apply two concrete actions to a DataFrame in parallel. 8 | A common use case is two views of the same data, normally 9 | one with sensitive data and one scrubbed/clean. 10 | """ 11 | # First we "persist" it (you can also checkpoint or choose a different 12 | # level of persistence. 13 | df.persist() 14 | df.count() 15 | # Create the distinct "safe" view. 16 | df1 = df.select("times") 17 | # Start the async actions 18 | async1 = df1.write.mode("append").format("parquet").saveAsync(p1) 19 | async2 = df.write.mode("append").format("parquet").saveAsync(p2) 20 | # Block until the writes are both finished. 
21 | async1.result() 22 | async2.result() 23 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/errors/ThrowsSuite.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.errors 2 | 3 | import com.holdenkarau.spark.testing._ 4 | import org.scalatest.funsuite.AnyFunSuite 5 | 6 | class ThrowsSuite extends AnyFunSuite with SharedSparkContext { 7 | test("inner throw & outer throw should both throw SparkExceptions exceptions") { 8 | intercept[org.apache.spark.SparkException] { 9 | Throws.throwInner(sc) 10 | } 11 | intercept[org.apache.spark.SparkException] { 12 | Throws.throwOuter(sc) 13 | } 14 | intercept[org.apache.spark.SparkException] { 15 | Throws.throwInner2(sc) 16 | } 17 | intercept[org.apache.spark.SparkException] { 18 | Throws.throwOuter2(sc) 19 | } 20 | } 21 | 22 | test("loading missing data should throw") { 23 | intercept[org.apache.hadoop.mapred.InvalidInputException] { 24 | Throws.nonExistentInput(sc) 25 | } 26 | } 27 | } 28 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/WordCount.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples; 2 | 3 | //tag::wordCount[] 4 | import scala.Tuple2; 5 | 6 | import org.apache.spark.api.java.JavaRDD; 7 | import org.apache.spark.api.java.JavaPairRDD; 8 | import org.apache.spark.api.java.JavaSparkContext; 9 | 10 | import java.util.regex.Pattern; 11 | import java.util.Arrays; 12 | 13 | public final class WordCount { 14 | private static final Pattern pattern = Pattern.compile(" "); 15 | 16 | public static void main(String[] args) throws Exception { 17 | JavaSparkContext jsc = new JavaSparkContext(); 18 | JavaRDD lines = jsc.textFile(args[0]); 19 | JavaRDD words = lines.flatMap(e -> Arrays.asList( 20 | pattern.split(e)).iterator()); 21 | JavaPairRDD wordsIntial = words.mapToPair( 22 | e -> new Tuple2(e, 1)); 23 | } 24 | } 25 | //end::wordCount[] 26 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/goldilocks/JoinTest.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.goldilocks 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import com.holdenkarau.spark.testing.SharedSparkContext 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | 9 | class JoinTest extends AnyFunSuite with SharedSparkContext { 10 | test("Hash join"){ 11 | val keySet = "a, b, c, d, e, f, g".split(",") 12 | val smallRDD = sc.parallelize(keySet.map(letter => (letter, letter.hashCode))) 13 | val largeRDD: RDD[(String, Double)] = 14 | sc.parallelize(keySet.flatMap{ letter => 15 | Range(1, 50).map(i => (letter, letter.hashCode() / i.toDouble))}) 16 | val result: RDD[(String, (Double, Int))] = 17 | RDDJoinExamples.manualBroadcastHashJoin( 18 | largeRDD, smallRDD) 19 | val nativeJoin: RDD[(String, (Double, Int))] = largeRDD.join(smallRDD) 20 | 21 | assert(result.subtract(nativeJoin).count == 0) 22 | } 23 | } 24 | -------------------------------------------------------------------------------- /target-validator/ex.yaml: -------------------------------------------------------------------------------- 1 | detailedErrors: true 2 | numKeyCols: 4 3 | # We might have a large number of errors so 
just show the first 5 4 | numErrorsToReport: 5 5 | 6 | email: 7 | smtpHost: smtp.example.com 8 | subject: Data Validation Summary 9 | from: data-validator-no-reply@example.com 10 | to: 11 | - professor-timbit@example.com 12 | 13 | tables: 14 | - db: gender_paygaps 15 | table: uk 16 | # Columns that taken together uniquely specifies each row (think of groupBy) 17 | keyColumns: 18 | - CompanyNumber 19 | - EmployerId 20 | - CompanyLinkToGPGInfo 21 | - ResponsiblePerson 22 | # Used to filter 23 | condition: MaleBonusPercent >= FemaleBonusPercent 24 | checks: 25 | # We expect at least 500 records 26 | - type: rowCount 27 | minNumRows: 500 28 | # We don't expect more than 1% not companies in the dataset. 29 | - type: nullCheck 30 | column: CompanyNumber 31 | threshold: 0.01 32 | -------------------------------------------------------------------------------- /accelerators/setup_comet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | source install_rust_if_needed.sh 5 | 6 | if command -v protoc >/dev/null 2>&1; then 7 | echo "protoc already installed" 8 | else 9 | sudo apt-get install -y protobuf-compiler 10 | fi 11 | 12 | if [ -z "${SPARK_MAJOR}" ]; then 13 | echo "Need a spark major version specified." 14 | exit 1 15 | else 16 | echo "Building comet for Spark ${SPARK_MAJOR}" 17 | fi 18 | 19 | #tag::build[] 20 | # If we don't have fusion checked out do it 21 | if [ ! -d arrow-datafusion-comet ]; then 22 | git clone https://github.com/apache/arrow-datafusion-comet.git 23 | fi 24 | 25 | # Build JAR if not present 26 | if [ -z "$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar)" ]; then 27 | cd arrow-datafusion-comet 28 | make clean release PROFILES="-Pspark-${SPARK_MAJOR} -Pscala-2.13" 29 | cd .. 30 | fi 31 | COMET_JAR="$(pwd)/$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*SNAPSHOT.jar)" 32 | export COMET_JAR 33 | #end::build[] 34 | -------------------------------------------------------------------------------- /high_performance_pyspark/__init__.py: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | 19 | """ 20 | Python version of selected examples from High Performance Spark 21 | """ 22 | 23 | import os 24 | import sys 25 | -------------------------------------------------------------------------------- /python/examples/test_dual_write.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | # tag::test[] 5 | from sparktestingbase.sqltestcase import SQLTestCase 6 | from pyspark.sql.functions import current_timestamp 7 | from pyspark.sql.types import Row 8 | from .dual_write import DualWriteExample 9 | 10 | 11 | class DualWriteTest(SQLTestCase): 12 | def test_always_passes(self): 13 | self.assertTrue(True) 14 | 15 | def test_actual_dual_write(self): 16 | tempdir = tempfile.mkdtemp() 17 | p1 = os.path.join(tempdir, "data1") 18 | p2 = os.path.join(tempdir, "data2") 19 | df = self.sqlCtx.createDataFrame([Row("timbit"), Row("farted")], ["names"]) 20 | combined = df.withColumn("times", current_timestamp()) 21 | DualWriteExample().do_write(combined, p1, p2) 22 | df1 = self.sqlCtx.read.format("parquet").load(p1) 23 | df2 = self.sqlCtx.read.format("parquet").load(p2) 24 | self.assertDataFrameEqual(df2.select("times"), df1, 0.1) 25 | 26 | 27 | # end::test[] 28 | -------------------------------------------------------------------------------- /sql/wap.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS local.wap_projects; 2 | CREATE TABLE local.wap_projects ( 3 | creator string, 4 | projectname string) 5 | USING iceberg 6 | PARTITIONED BY (creator); 7 | ALTER TABLE local.projects SET TBLPROPERTIES ( 8 | 'write.wap.enabled''true' 9 | ); 10 | -- We need a first commit, see https://github.com/apache/iceberg/issues/8849 11 | INSERT INTO local.wap_projects VALUES("holdenk", "spark"); 12 | ALTER TABLE local.wap_projects DROP BRANCH IF EXISTS `audit-branch`; 13 | ALTER TABLE local.wap_projects CREATE BRANCH `audit-branch`; 14 | SET spark.wap.branch = 'audit-branch'; 15 | INSERT INTO local.projects VALUES("krisnova", "aurae"); 16 | SELECT count(*) FROM local.wap_projects VERSION AS OF 'audit-branch' WHERE creator is NULL; 17 | SELECT count(*) FROM local.wap_projects VERSION AS OF 'audit-branch' WHERE creator == "krisnova"; 18 | CALL local.system.remove_orphan_files(table => 'local.wap_projects'); 19 | CALL local.system.fast_forward("local.wap_projects", "main", "audit-branch"); 20 | -------------------------------------------------------------------------------- /core/src/main/perl/Makefile.PL: -------------------------------------------------------------------------------- 1 | use 5.006; 2 | use strict; 3 | use warnings; 4 | use ExtUtils::MakeMaker; 5 | 6 | WriteMakefile( 7 | NAME => 'HighPerformanceSpark::Examples', 8 | AUTHOR => q{Holden Karau And Rachel Warren }, 9 | VERSION_FROM => 'lib/HighPerformanceSpark/Examples.pm', 10 | ABSTRACT_FROM => 'lib/HighPerformanceSpark/Examples.pm', 11 | LICENSE => 'apache_2_0', 12 | PL_FILES => {}, 13 | EXE_FILES => [ 'ghinfo.pl' ], 14 | MIN_PERL_VERSION => 5.006, 15 | CONFIGURE_REQUIRES => { 16 | 'ExtUtils::MakeMaker' => 0, 17 | }, 18 | BUILD_REQUIRES => { 19 | 'Test::More' => 0, 20 | }, 21 | PREREQ_PM => { 22 | 'Pithub' => 0.01033, 23 | #'ABC' => 1.6, 24 | #'Foo::Bar::Module' => 5.0401, 25 | }, 26 | dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', }, 27 | clean => { FILES => 'HighPerformanceSpark-Examples-*' }, 28 | ); 29 | -------------------------------------------------------------------------------- 
/se_complex.json: -------------------------------------------------------------------------------- 1 | {"product_id": "pay", "table_name": "local.3rd_fake", "rule_type": "row_dq", "rule": "bonus_checker", "column_name": "MaleBonusPercent", "expectation": "MaleBonusPercent > FemaleBonusPercent", "action_if_failed": "drop", "tag": "", "description": "Sample rule that the male bonuses should be higher. Thankfully this fails (but could be lower base pay etc.)", "enable_for_source_dq_validation": true, "enable_for_target_dq_validation": false, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} 2 | {"product_id": "pay", "table_name": "local.3rd_fake", "rule_type": "query_dq", "rule": "history", "column_name": "MaleBonusPercent", "expectation": "(select count(*) from 3rd_fake_view) > (select input_count from local.dq_stats WHERE table_name='local.3rd_fake' LIMIT 1)", "action_if_failed": "fail", "tag": "", "description": "We should always have more records than before", "enable_for_source_dq_validation": false, "enable_for_target_dq_validation": true, "is_active": true, "enable_error_drop_alert": true, "error_drop_threshold": 1} 3 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/streaming/DStreamSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Simple tests for DStreamSuite - 3 | * normally we would use streaming tests but since we want to test 4 | * context creation as well we don't. 5 | */ 6 | package com.highperformancespark.examples.streaming 7 | 8 | import java.lang.Thread 9 | 10 | import org.apache.spark.streaming._ 11 | 12 | import com.holdenkarau.spark.testing._ 13 | import org.scalatest.funsuite.AnyFunSuite 14 | 15 | class DStreamExamplesSuite extends AnyFunSuite with SharedSparkContext { 16 | test("simple set up") { 17 | val ssc = DStreamExamples.makeStreamingContext(sc) 18 | val inputStream = DStreamExamples.fileAPIExample(ssc, "./") 19 | val repartitioned = DStreamExamples.repartition(inputStream) 20 | repartitioned.foreachRDD(rdd => 21 | assert(rdd.partitioner.get.numPartitions == 20) 22 | ) 23 | ssc.start() 24 | // This is bad don't do this - but we don't have the full test tools here 25 | Thread.sleep(100) 26 | ssc.stop() 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /python/setup.cfg: -------------------------------------------------------------------------------- 1 | [metadata] 2 | name = examples 3 | version = attr: examples.__version__ 4 | author = Holden and Anya 5 | author_email = your@email.address 6 | url = https://github.com/high-performance-spark/high-performance-spark-examples 7 | description = Python Examples for High Performance Spark 8 | long_description = file: README.md 9 | long_description_content_type = text/markdown 10 | keywords = example, setuptools, pyspark 11 | license = BSD 3-Clause License 12 | classifiers = 13 | License :: OSI Approved :: BSD License 14 | Programming Language :: Python :: 3 15 | 16 | [options] 17 | packages = find: 18 | zip_safe = True 19 | include_package_data = True 20 | install_requires = 21 | pandas >= 1.4.1 22 | PyYAML >= 6.0 23 | typer 24 | mypy 25 | pyspark 26 | pyspark-asyncactions 27 | 28 | 29 | [options.entry_points] 30 | console_scripts = 31 | my-example-utility = example.example_module:main 32 | 33 | [options.extras_require] 34 | dev = 35 | black>=22.1.0 36 | flake8>=4.0.1 37 | 38 | 
[options.package_data] 39 | * = README.md -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/streaming/Structured.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.structuredstreaming 2 | 3 | import scala.concurrent.duration._ 4 | 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.streaming._ 8 | import org.apache.spark.sql.streaming.Trigger 9 | 10 | 11 | object Structured { 12 | def load(inputPath: String, session: SparkSession): Dataset[_] = { 13 | //tag::loadSimple[] 14 | session.readStream.parquet(inputPath) 15 | //end::loadSimple[] 16 | } 17 | def write(counts: Dataset[_]) = { 18 | //tag::writeComplete[] 19 | val query = counts.writeStream. 20 | // Specify the output mode as Complete to support aggregations 21 | outputMode(OutputMode.Complete()). 22 | // Write out the result as parquet 23 | format("parquet"). 24 | // Specify the interval at which new data will be picked up 25 | trigger(Trigger.ProcessingTime(1.second)). 26 | queryName("pandas").start() 27 | //end::writeComplete[] 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/transformations/NarrowAndWide.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.transformations 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | 6 | object NarrowAndWide { 7 | 8 | //toDO: Probably should write some sort of test for this. 9 | //this is used in chapter 4 for the stage diagram 10 | def sillySparkProgram(rdd1 : RDD[Int]) = { 11 | 12 | //tag::narrowWide[] 13 | 14 | //Narrow dependency. Map the rdd to tuples of (x, 1) 15 | val rdd2 = rdd1.map(x => (x, 1)) 16 | //wide dependency groupByKey 17 | val rdd3 = rdd2.groupByKey() 18 | //end::narrowWide[] 19 | 20 | rdd3 21 | } 22 | //this is used in chapter two for the stage diagram. 
23 | 24 | //tag::stageDiagram[] 25 | def simpleSparkProgram(rdd : RDD[Double]): Long ={ 26 | //stage1 27 | rdd.filter(_< 1000.0) 28 | .map(x => (x, x) ) 29 | //stage2 30 | .groupByKey() 31 | .map{ case(value, groups) => (groups.sum, value)} 32 | //stage 3 33 | .sortByKey() 34 | .count() 35 | } 36 | //end::stageDiagram[] 37 | 38 | } 39 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/NullabilityFilterOptimizer.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Extension for the SparkSession to allow us to plug in a custom optimizer 3 | */ 4 | 5 | package com.highperformancespark.examples.dataframe 6 | 7 | import org.apache.spark.sql.catalyst.optimizer._ 8 | import org.apache.spark.sql.catalyst.plans.logical._ 9 | import org.apache.spark.sql.catalyst.rules.Rule 10 | import org.apache.spark.sql.catalyst.trees.TreePattern._ 11 | import org.apache.spark.sql.catalyst.expressions.{And, IsNotNull} 12 | 13 | object NullabilityFilterOptimizer extends Rule[LogicalPlan] { 14 | 15 | def apply(plan: LogicalPlan): LogicalPlan = { 16 | plan.transform { 17 | case p @ Project(projectList, projChild) => 18 | val children = projectList.flatMap(_.children) 19 | // If there are no null intolerant children don't worry about it 20 | if (children.isEmpty) { 21 | p 22 | } else { 23 | val filterCond = children.map(IsNotNull(_)).reduceLeft(And) 24 | Project(projectList, Filter(filterCond, projChild)) 25 | } 26 | } 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/RawPandas.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.dataframe 2 | 3 | import java.util.Arrays 4 | import java.util.Objects 5 | 6 | /** 7 | * @param id panda id 8 | * @param zip zip code of panda residence 9 | * @param pt Type of panda as a string 10 | * @param happy if panda is happy 11 | * @param attributes array of panada attributes 12 | */ 13 | case class RawPanda(id: Long, zip: String, pt: String, 14 | happy: Boolean, attributes: Array[Double]) { 15 | override def equals(o: Any) = o match { 16 | case other: RawPanda => (id == other.id && pt == other.pt && 17 | happy == other.happy && attributes.sameElements(other.attributes)) 18 | case _ => false 19 | } 20 | override def hashCode(): Int = { 21 | 3 * Objects.hashCode(id) + 7 * Objects.hashCode(zip) + 22 | 11 * Objects.hashCode(pt) + 13 * Arrays.hashCode(attributes) 23 | } 24 | } 25 | 26 | /** 27 | * @param name place name 28 | * @param pandas pandas in that place 29 | */ 30 | case class PandaPlace(name: String, pandas: Array[RawPanda]) 31 | 32 | case class CoffeeShop(zip: String, name: String) 33 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/tokenize/SampleTokenizeSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Simple tests for tokenization 3 | */ 4 | package com.highperformancespark.examples.tokenize 5 | 6 | import java.lang.Thread 7 | 8 | import org.apache.spark.streaming._ 9 | 10 | import com.holdenkarau.spark.testing._ 11 | import org.scalatest.funsuite.AnyFunSuite 12 | 13 | class SampleTokenizeSuite extends AnyFunSuite with SharedSparkContext { 14 | val input = List("hi holden", "I like coffee") 
15 | val expected = List("hi", "holden", "I", "like", "coffee") 16 | 17 | test("test the difficult to test one") { 18 | val inputRDD = sc.parallelize(input) 19 | val result = SampleTokenize.difficultTokenizeRDD(inputRDD).collect() 20 | assert(result.toList == expected) 21 | } 22 | 23 | test("test the easy to test one like the difficult one") { 24 | val inputRDD = sc.parallelize(input) 25 | val result = SampleTokenize.tokenizeRDD(inputRDD).collect() 26 | assert(result.toList == expected) 27 | } 28 | 29 | test("test the easy inner function - note no SC needed") { 30 | assert(SampleTokenize.tokenize("hi holden").toList == List("hi", "holden")) 31 | } 32 | } 33 | -------------------------------------------------------------------------------- /python/examples/load_previous_run_data.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | 5 | class LoadPreviousRunData(object): 6 | def __init__(self, session): 7 | self.session = session 8 | 9 | def find_oldest_id(self, local_path): 10 | """Find the oldest Spark job since it's probably not being updated.""" 11 | directories = os.listdir(local_path) 12 | return min(directories, key=lambda x: os.path.getmtime(f"{local_path}/{x}")) 13 | 14 | def do_magic(self): 15 | local_path = "/tmp/spark-events" 16 | event_log_path = f"file://{local_path}" 17 | application_id = self.find_oldest_id(local_path) 18 | return self.load_json_records(event_log_path, application_id) 19 | 20 | # tag::load[] 21 | def load_json_records(self, event_log_path, application_id): 22 | print(f"Loading {application_id}") 23 | full_log_path = f"{event_log_path}/{application_id}" 24 | df = self.session.read.json(full_log_path) 25 | special_events = df.filter( 26 | (df["Event"] == "SparkListenerExecutorAdded") 27 | | (df["Event"] == "SparkListenerJobEnd") 28 | ) 29 | special_events.show() 30 | 31 | # end::load[] 32 | -------------------------------------------------------------------------------- /accelerators/gluten_env_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if we have gluten and the gluten UDFs present 4 | GLUTEN_NATIVE_LIB_NAME=libhigh-performance-spark-gluten-0.so 5 | NATIVE_LIB_DIR=$(pwd)/../native/src/ 6 | NATIVE_LIB_PATH="${NATIVE_LIB_DIR}${GLUTEN_NATIVE_LIB_NAME}" 7 | GLUTEN_HOME=incubator-gluten 8 | source /etc/lsb-release 9 | if [ -n "$GLUTEN_JAR_PATH" ]; then 10 | GLUTEN_EXISTS="true" 11 | GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \ 12 | --conf spark.memory.offHeap.enabled=true \ 13 | --conf spark.memory.offHeap.size=5g \ 14 | --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ 15 | --jars ${GLUTEN_JAR_PATH}" 16 | fi 17 | if [ -f "${NATIVE_LIB_PATH}" ]; then 18 | if [ "$GLUTEN_EXISTS" == "true" ]; then 19 | GLUTEN_UDF_EXISTS="true" 20 | GLUTEN_SPARK_EXTRA="$GLUTEN_SPARK_EXTRA \ 21 | --conf spark.jars=${GLUTEN_JAR_PATH} \ 22 | --conf spark.gluten.loadLibFromJar=true \ 23 | --files ${NATIVE_LIB_PATH} \ 24 | --conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=${GLUTEN_NATIVE_LIB_NAME}" 25 | fi 26 | fi 27 | SPARK_EXTRA="$GLUTEN_SPARK_EXTRA" 28 | 29 | export SPARK_EXTRA 30 | export GLUTEN_UDF_EXISTS 31 | export GLUTEN_EXISTS 32 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala: --------------------------------------------------------------------------------
1 | /** 2 | * Simple tests for our CustomPipeline demo pipeline stage 3 | */ 4 | package com.highperformancespark.examples.ml 5 | 6 | import org.apache.spark.sql.Dataset 7 | 8 | import com.holdenkarau.spark.testing.DataFrameSuiteBase 9 | import org.scalatest.funsuite.AnyFunSuite 10 | 11 | case class TestRow(id: Int, inputColumn: String) 12 | 13 | class CustomPipelineSuite extends AnyFunSuite with DataFrameSuiteBase { 14 | val d = List( 15 | TestRow(0, "a"), 16 | TestRow(1, "b"), 17 | TestRow(2, "c"), 18 | TestRow(3, "a"), 19 | TestRow(4, "a"), 20 | TestRow(5, "c") 21 | ) 22 | 23 | test("test spark context") { 24 | val session = spark 25 | val rdd = session.sparkContext.parallelize(1 to 10) 26 | assert(rdd.sum === 55) 27 | } 28 | 29 | test("simple indexer test") { 30 | val session = spark 31 | import session.implicits._ 32 | val ds: Dataset[TestRow] = session.createDataset(d) 33 | val indexer = new SimpleIndexer() 34 | indexer.setInputCol("inputColumn") 35 | indexer.setOutputCol("categoryIndex") 36 | val model = indexer.fit(ds) 37 | val predicted = model.transform(ds) 38 | assert(predicted.columns.contains("categoryIndex")) 39 | predicted.show() 40 | } 41 | } -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/objects/JavaPandas.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.objects; 2 | 3 | import java.io.Serializable; 4 | 5 | public class JavaPandas implements Serializable { 6 | private String name; 7 | private String zip; 8 | private int pandaSize; 9 | private int age; 10 | 11 | /** 12 | * @param name name of panda 13 | * @param zip zip code 14 | * @param pandaSize size of panda in KG 15 | * @param age age of panda 16 | */ 17 | public JavaPandas(String name, String zip, int pandaSize, int age) { 18 | this.name = name; 19 | this.zip = zip; 20 | this.pandaSize = pandaSize; 21 | this.age = age; 22 | } 23 | 24 | public String getName() { 25 | return name; 26 | } 27 | 28 | public void setName(String name) { 29 | this.name = name; 30 | } 31 | 32 | public String getZip() { 33 | return zip; 34 | } 35 | 36 | public void setZip(String zip) { 37 | this.zip = zip; 38 | } 39 | 40 | public int getPandaSize() { 41 | return pandaSize; 42 | } 43 | 44 | public void setPandaSize(int pandaSize) { 45 | this.pandaSize = pandaSize; 46 | } 47 | 48 | public int getAge() { 49 | return age; 50 | } 51 | 52 | public void setAge(int age) { 53 | this.age = age; 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlibSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Basic tests for our MLlib examples 3 | */ 4 | package com.highperformancespark.examples.mllib 5 | 6 | import org.apache.spark.mllib.linalg.{Vector => SparkVector} 7 | 8 | import com.highperformancespark.examples.dataframe.RawPanda 9 | import com.holdenkarau.spark.testing._ 10 | import org.scalatest.funsuite.AnyFunSuite 11 | 12 | class GoldilocksMLlibSuite extends AnyFunSuite with SharedSparkContext { 13 | val rps = List( 14 | RawPanda(1L, "94110", "giant", true, Array(0.0, 0.0)), 15 | RawPanda(2L, "94110", "giant", false, Array(0.0, 3.0)), 16 | RawPanda(3L, "94110", "giant", true, Array(0.0, 2.0))) 17 | 18 | test("boolean to double") { 19 | assert(1.0 === GoldilocksMLlib.booleanToDouble(true)) 20 | assert(0.0 
=== GoldilocksMLlib.booleanToDouble(false)) 21 | } 22 | 23 | test("encoding") { 24 | val input = sc.parallelize(rps) 25 | val points = GoldilocksMLlib.toLabeledPointDense(input) 26 | assert(points.count() == 3) 27 | assert(points.filter(_.label != 0.0).count() == 2) 28 | } 29 | 30 | test("lookup table") { 31 | val input = sc.parallelize(List("hi", "bye", "coffee", "hi")) 32 | val table = GoldilocksMLlib.createLabelLookup(input) 33 | assert(table.size == 3) 34 | } 35 | 36 | } 37 | -------------------------------------------------------------------------------- /python/examples/test_dual_write_new.py: -------------------------------------------------------------------------------- 1 | import os 2 | import tempfile 3 | 4 | # tag::test[] 5 | import unittest 6 | from pyspark.sql import SparkSession 7 | from pyspark.sql.functions import current_timestamp 8 | from pyspark.sql.types import Row 9 | from pyspark.testing.utils import assertDataFrameEqual 10 | from .dual_write import DualWriteExample 11 | 12 | 13 | class DualWriteTest(unittest.TestCase): 14 | @classmethod 15 | def setUpClass(cls): 16 | cls.spark = SparkSession.builder.appName( 17 | "Testing PySpark Example" 18 | ).getOrCreate() 19 | 20 | @classmethod 21 | def tearDownClass(cls): 22 | cls.spark.stop() 23 | 24 | def test_always_passes(self): 25 | self.assertTrue(True) 26 | 27 | def test_actual_dual_write(self): 28 | tempdir = tempfile.mkdtemp() 29 | p1 = os.path.join(tempdir, "data1") 30 | p2 = os.path.join(tempdir, "data2") 31 | df = self.spark.createDataFrame([Row("timbit"), Row("farted")], ["names"]) 32 | combined = df.withColumn("times", current_timestamp()) 33 | DualWriteExample().do_write(combined, p1, p2) 34 | df1 = self.spark.read.format("parquet").load(p1) 35 | df2 = self.spark.read.format("parquet").load(p2) 36 | assertDataFrameEqual(df2.select("times"), df1, 0.1) 37 | 38 | 39 | # end::test[] 40 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/JavaInterop.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples; 2 | 3 | import scala.reflect.*; 4 | import scala.Tuple2; 5 | 6 | import org.apache.spark.rdd.RDD; 7 | import org.apache.spark.api.java.JavaRDD; 8 | import org.apache.spark.api.java.JavaPairRDD; 9 | import org.apache.spark.api.java.JavaSparkContext; 10 | 11 | import java.util.HashMap; 12 | import java.util.Map; 13 | 14 | import static org.apache.spark.sql.functions.*; 15 | 16 | public class JavaInterop { 17 | 18 | //tag::realClassTag[] 19 | public static JavaPairRDD wrapPairRDD( 20 | RDD> rdd) { 21 | // Construct the class tags 22 | ClassTag strCt = ClassTag$.MODULE$.apply(String.class); 23 | ClassTag longCt = ClassTag$.MODULE$.apply(scala.Long.class); 24 | return new JavaPairRDD(rdd, strCt, longCt); 25 | } 26 | //end::realClassTag[] 27 | 28 | //tag::fakeClassTag[] 29 | public static JavaPairRDD wrapPairRDDFakeCt( 30 | RDD> rdd) { 31 | // Construct the class tags by casting AnyRef - this would be more commonly done 32 | // with generic or templated code where we can't explicitly construct the correct 33 | // class tag as using fake class tags may result in degraded performance. 
34 | ClassTag fake = ClassTag$.MODULE$.AnyRef(); 35 | return new JavaPairRDD(rdd, fake, fake); 36 | } 37 | //end::fakeClassTag[] 38 | } 39 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/tools/FilterInvalidPandasSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Tests that we filter out bad pandas. 3 | */ 4 | package com.highperformancespark.examples.tools 5 | 6 | import com.highperformancespark.examples.dataframe.RawPanda 7 | import com.holdenkarau.spark.testing._ 8 | import org.scalatest.funsuite.AnyFunSuite 9 | 10 | class FilterInvalidPandasSuite extends AnyFunSuite with SharedSparkContext { 11 | test("simple filter") { 12 | val invalidPandas = List(1L, 2L) 13 | val inputPandas = List( 14 | RawPanda(1L, "94110", "giant", true, Array(0.0)), 15 | RawPanda(3L, "94110", "giant", true, Array(0.0))) 16 | val input = sc.parallelize(inputPandas) 17 | val result1 = 18 | FilterInvalidPandas.filterInvalidPandas(sc, invalidPandas, input) 19 | val result2 = 20 | FilterInvalidPandas.filterInvalidPandasWithLogs(sc, invalidPandas, input) 21 | assert(result1.collect() === result2.collect()) 22 | assert(result1.count() === 1) 23 | } 24 | 25 | test("alt log") { 26 | val invalidPandas = List(1L, 2L) 27 | val inputPandas = List( 28 | RawPanda(1L, "94110", "giant", true, Array(0.0)), 29 | RawPanda(3L, "94110", "giant", true, Array(0.0))) 30 | val input = sc.parallelize(inputPandas) 31 | val al = new AltLog() 32 | val result1 = 33 | al.filterInvalidPandasWithLogs(sc, invalidPandas, input) 34 | assert(result1.count() === 1) 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # high-performance-spark-examples 2 | Examples for High Performance Spark 3 | 4 | We are in the process of updating this for Spark 4 (some parts depending on external libraries like Iceberg, Comet, etc. are still 3.X) and the second edition of our book! 5 | 6 | # Building 7 | 8 | Most of the examples can be built with sbt; the C and Fortran components depend on gcc, g77, and cmake. 9 | 10 | # Tests 11 | 12 | The full test suite depends on having the C and Fortran components built as well as a local R installation available. 13 | 14 | The most "accurate" way of seeing how we run the tests is to look at the .github workflows 15 | 16 | # History Server 17 | 18 | The history server can be a great way to figure out what's going on. 19 | 20 | By default the history server uses `/tmp/spark-events`, so you'll need to create that directory if it isn't already set up: 21 | 22 | `mkdir -p /tmp/spark-events` 23 | 24 | The scripts for running the examples generally run with the event log enabled. 25 | 26 | You can set `SPARK_EVENTLOG=true` before running the Scala tests and you'll get the event logs for the history server too! 27 | 28 | e.g.
29 | 30 | `SPARK_EVENTLOG=true sbt test` 31 | 32 | If you want to run just a specific test you can run [testOnly](https://www.scala-sbt.org/1.x/docs/Testing.html) 33 | 34 | Then to view the history server you'll want to launch it using the `${SPARK_HOME}/sbin/start-history-server.sh` then you [can go to your local history server](http://localhost:18080/) 35 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/transformations/Accumulators.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Test that the accumulator example computes stuff. 3 | */ 4 | package com.highperformancespark.examples.transformations 5 | 6 | import scala.collection.immutable.HashSet 7 | 8 | import com.highperformancespark.examples.dataframe.RawPanda 9 | import com.holdenkarau.spark.testing._ 10 | import org.scalatest.funsuite.AnyFunSuite 11 | 12 | class AccumulatorsTest extends AnyFunSuite with SharedSparkContext { 13 | test("accumulator max should function") { 14 | val input = sc.parallelize(1.to(100)).map(x => 15 | RawPanda(1L, "1", "red", true, Array(x.toDouble))) 16 | val (_, max) = Accumulators.computeMaxFuzzyNess(sc, input) 17 | assert(max === 100.0) 18 | } 19 | 20 | test("accumulator sum should function") { 21 | val input = sc.parallelize(1.to(100)).map(x => 22 | RawPanda(1L, "1", "red", true, Array(x.toDouble))) 23 | val (_, sum) = Accumulators.computeTotalFuzzyNess(sc, input) 24 | assert(sum === 5050.0) 25 | } 26 | 27 | test("accumulator unique should function") { 28 | val input1 = sc.parallelize(1 to 100).map(x => 29 | RawPanda(1L, "1", "red", true, Array(x.toDouble)) 30 | ) 31 | 32 | val input2 = sc.parallelize(1 to 100).map(x => 33 | RawPanda(2L, "2", "blude", false, Array(x.toDouble)) 34 | ) 35 | 36 | val set = Accumulators.uniquePandas(sc, input1 ++ input2) 37 | assert(set == HashSet(2, 1)) 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/tools/ResourceProfileEx.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.gpu 2 | 3 | import org.apache.spark.sql.SparkSession 4 | import org.apache.spark.resource._ 5 | import org.apache.spark.resource.ResourceProfileBuilder 6 | import org.apache.spark.TaskContext 7 | 8 | object GPUResourceProfileExample { 9 | def main(args: Array[String]): Unit = { 10 | val spark = SparkSession.builder() 11 | .appName("GPUResourceProfileExample") 12 | .getOrCreate() 13 | run(spark) 14 | } 15 | 16 | def run(spark: SparkSession) = { 17 | val sc = spark.sparkContext 18 | //tag::gpuResourceProfileExample[] 19 | // Create a resource profile requesting 2 NVIDIA GPUs per executor and 1 per task 20 | val gpuResourceProfile = new ResourceProfileBuilder() 21 | .require(new ExecutorResourceRequests().resource( 22 | "gpu", 2, vendor="nvidia", 23 | discoveryScript="/opt/spark/bin/getGpusResources.sh" // See sample in Spark repo 24 | )) 25 | .require(new TaskResourceRequests().resource("gpu", 1)) 26 | .build() 27 | 28 | // Use resource profile to run on a machine with GPUs. 
29 | val rdd = sc.parallelize(1 to 4, 4) 30 | .withResources(gpuResourceProfile) 31 | .map { i => 32 | // Do some special GPU stuff here my friend 33 | i 34 | } 35 | //end::gpuResourceProfileExample[] 36 | 37 | rdd.collect().foreach(println) 38 | 39 | spark.stop() 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /core/src/main/r/dapply.R: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | #tag::example[] 19 | library(SparkR) 20 | 21 | # Setup SparkContext & SQLContext 22 | sc <- sparkR.init(appName="high-performance-spark-wordcount-example") 23 | 24 | # Initialize SQLContext 25 | sqlContext <- sparkRSQL.init(sc) 26 | 27 | 28 | # Count the number of characters - note this fails on the text DF due to a bug. 29 | df <- createDataFrame (sqlContext, 30 | list(list(1L, 1, "1"), 31 | list(2L, 2, "22"), 32 | list(3L, 3, "333")), 33 | c("a", "b", "c")) 34 | resultingSchema <- structType(structField("length", "integer")) 35 | result <- dapply(df, function(row) { 36 | y <- list() 37 | y <- cbind(y, nchar(row[[3]])) 38 | }, resultingSchema) 39 | showDF(result) 40 | #end::example[] 41 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/objects/JavaPandaInfo.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.objects; 2 | 3 | import java.io.Serializable; 4 | 5 | public class JavaPandaInfo implements Serializable { 6 | private String place; 7 | private String pandaType; 8 | private int happyPandas; 9 | private int totalPandas; 10 | 11 | /** 12 | * @param place name of place 13 | * @param pandaType type of pandas in this place 14 | * @param happyPandas number of happy pandas in this place 15 | * @param totalPandas total number of pandas in this place 16 | */ 17 | public JavaPandaInfo(String place, String pandaType, int happyPandas, int totalPandas) { 18 | this.place = place; 19 | this.pandaType = pandaType; 20 | this.happyPandas = happyPandas; 21 | this.totalPandas = totalPandas; 22 | } 23 | 24 | public String getPlace() { 25 | return place; 26 | } 27 | 28 | public void setPlace(String place) { 29 | this.place = place; 30 | } 31 | 32 | public String getPandaType() { 33 | return pandaType; 34 | } 35 | 36 | public void setPandaType(String pandaType) { 37 | this.pandaType = pandaType; 38 | } 39 | 40 | public int getHappyPandas() { 41 | return happyPandas; 42 | } 43 | 44 | public void setHappyPandas(int happyPandas) { 45 | this.happyPandas = happyPandas; 46 | } 47 | 48 | public int getTotalPandas() { 49 | return 
totalPandas; 50 | } 51 | 52 | public void setTotalPandas(int totalPandas) { 53 | this.totalPandas = totalPandas; 54 | } 55 | 56 | } 57 | -------------------------------------------------------------------------------- /python/tox.ini: -------------------------------------------------------------------------------- 1 | [tox] 2 | passenv = * 3 | isolated_build = True 4 | requires = tox-conda 5 | envlist = 6 | isort 7 | py310 8 | black 9 | mypy 10 | flake8 11 | 12 | skip_missing_interpeters = true 13 | 14 | [gh-actions] 15 | python = 16 | # 3.9: py39 17 | # We need a new version of PySpark w/3.10 support. 18 | 3.10: py310 19 | 20 | [testenv] 21 | setenv = 22 | DJANGO_SETTINGS_MODULE=fighthealthinsurance.settings 23 | PYTHONPATH={toxinidir} 24 | DJANGO_CONFIGURATION=Dev 25 | passenv = * 26 | extras = 27 | tests 28 | coverage 29 | deps = 30 | pytest 31 | isort==4.3.21 32 | pyspark==3.5.0 33 | flake8 34 | spark-testing-base>=0.11.1 35 | mypy 36 | -rrequirements.txt 37 | commands = 38 | pytest examples \ 39 | {posargs} 40 | allowlist_externals = pytest 41 | 42 | [testenv:isort] 43 | extras = tests 44 | skipsdist = True 45 | commands = isort --check-only --diff examples 46 | allowlist_externals = isort 47 | 48 | [testenv:black] 49 | extras = tests 50 | skipsdist = True 51 | commands = black --check examples 52 | allowlist_externals = black 53 | deps = 54 | black 55 | -rrequirements.txt 56 | 57 | [testenv:flake8] 58 | extras = tests 59 | skipsdist = True 60 | commands = flake8 --ignore=F403,E402,F401,F405,W503,E265 examples 61 | allowlist_externals = flake8 62 | 63 | [testenv:mypy] 64 | extras = tests 65 | passenv = * 66 | deps = 67 | pytest 68 | mypy 69 | -rrequirements.txt 70 | setenv = 71 | {[testenv]setenv} 72 | MYPYPATH={toxinidir} 73 | commands = 74 | mypy -m examples 75 | allowlist_externals = mypy -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/tools/GenerateScalingDataSuite.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Verify that generate scaling data returns results 3 | */ 4 | package com.highperformancespark.examples.tools 5 | 6 | import com.highperformancespark.examples.dataframe.RawPanda 7 | import com.holdenkarau.spark.testing._ 8 | import org.scalatest.funsuite.AnyFunSuite 9 | 10 | class GeneratescalaingDataSuite extends AnyFunSuite with SharedSparkContext { 11 | // The number of entries depends somewhat on the partition split because we 12 | // zip multiple separate RDDs so its more of a "request" 13 | test("expected num entries") { 14 | val result = GenerateScalingData.generateFullGoldilocks(sc, 10L, 20) 15 | assert(result.count() <= 10) 16 | assert(result.count() > 5) 17 | assert(result.map(_.id).distinct().count() > 1) 18 | } 19 | 20 | test("expected num entries same id") { 21 | val result = GenerateScalingData.generateGoldilocks(sc, 5L, 20) 22 | assert(result.count() <= 5) 23 | assert(result.count() >= 2) 24 | assert(result.map(_.id).distinct().count() == 1) 25 | } 26 | 27 | test("mini scale data") { 28 | val result = GenerateScalingData.generateMiniScale(sc, 20L, 1) 29 | assert(result.count() <= 20) 30 | assert(result.count() > 5) 31 | assert(result.map(_._1).distinct().count() > 1) 32 | } 33 | 34 | test("mini scale rows") { 35 | val result = GenerateScalingData.generateMiniScaleRows(sc, 20L, 1) 36 | assert(result.count() <= 20) 37 | assert(result.count() > 5) 38 | assert(result.map(_(0)).distinct().count() > 1) 39 | } 40 | } 41 | 
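GenerateScalingDataSuite above only checks that the generators return a plausible number of rows. Below is a rough sketch of driving the same generator outside of the tests, relying only on the call shape exercised in the suite (`GenerateScalingData.generateFullGoldilocks(sc, 1000L, 20)` mirrors the suite's `generateFullGoldilocks(sc, 10L, 20)` with a larger request); the `ScalingDataDriver` object, app name, master setting, and argument values are illustrative assumptions, not anything defined in the repository.

```scala
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

import com.highperformancespark.examples.tools.GenerateScalingData

// Hypothetical driver for manually eyeballing the generated scaling data.
object ScalingDataDriver {
  def main(args: Array[String]): Unit = {
    // local[4] is just for a quick standalone run; drop it when using spark-submit.
    val conf = new SparkConf().setAppName("generate-scaling-data").setMaster("local[4]")
    val sc = new SparkContext(conf)
    // As the suite notes, the row count is a "request" rather than an exact number,
    // because the generator zips several independently generated RDDs.
    val pandas = GenerateScalingData.generateFullGoldilocks(sc, 1000L, 20)
    println(s"Generated ${pandas.count()} records in ${pandas.getNumPartitions} partitions")
    sc.stop()
  }
}
```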
-------------------------------------------------------------------------------- /core/src/main/perl/xt/boilerplate.t: -------------------------------------------------------------------------------- 1 | #!perl -T 2 | use 5.006; 3 | use strict; 4 | use warnings; 5 | use Test::More; 6 | 7 | plan tests => 3; 8 | 9 | sub not_in_file_ok { 10 | my ($filename, %regex) = @_; 11 | open( my $fh, '<', $filename ) 12 | or die "couldn't open $filename for reading: $!"; 13 | 14 | my %violated; 15 | 16 | while (my $line = <$fh>) { 17 | while (my ($desc, $regex) = each %regex) { 18 | if ($line =~ $regex) { 19 | push @{$violated{$desc}||=[]}, $.; 20 | } 21 | } 22 | } 23 | 24 | if (%violated) { 25 | fail("$filename contains boilerplate text"); 26 | diag "$_ appears on lines @{$violated{$_}}" for keys %violated; 27 | } else { 28 | pass("$filename contains no boilerplate text"); 29 | } 30 | } 31 | 32 | sub module_boilerplate_ok { 33 | my ($module) = @_; 34 | not_in_file_ok($module => 35 | 'the great new $MODULENAME' => qr/ - The great new /, 36 | 'boilerplate description' => qr/Quick summary of what the module/, 37 | 'stub function definition' => qr/function[12]/, 38 | ); 39 | } 40 | 41 | TODO: { 42 | local $TODO = "Need to replace the boilerplate text"; 43 | 44 | not_in_file_ok(README => 45 | "The README is used..." => qr/The README is used/, 46 | "'version information here'" => qr/to provide version information/, 47 | ); 48 | 49 | not_in_file_ok(Changes => 50 | "placeholder date/time" => qr(Date/time) 51 | ); 52 | 53 | module_boilerplate_ok('lib/HighPerformanceSpark/Examples.pm'); 54 | 55 | 56 | } 57 | 58 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/objects/JavaRawPanda.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.objects; 2 | 3 | import java.io.Serializable; 4 | import java.util.List; 5 | 6 | public class JavaRawPanda implements Serializable { 7 | private long id; 8 | private String zip; 9 | private String pt; 10 | private boolean happy; 11 | private List attributes; 12 | 13 | /** 14 | * @param id panda id 15 | * @param zip zip code of panda residence 16 | * @param pt Type of panda as a string 17 | * @param happy if panda is happy 18 | * @param attributes array of panada attributes 19 | */ 20 | public JavaRawPanda(long id, String zip, String pt, boolean happy, List attributes) { 21 | this.attributes = attributes; 22 | this.id = id; 23 | this.zip = zip; 24 | this.pt = pt; 25 | this.happy = happy; 26 | } 27 | 28 | public long getId() { 29 | return id; 30 | } 31 | 32 | public void setId(long id) { 33 | this.id = id; 34 | } 35 | 36 | public String getZip() { 37 | return zip; 38 | } 39 | 40 | public void setZip(String zip) { 41 | this.zip = zip; 42 | } 43 | 44 | public String getPt() { 45 | return pt; 46 | } 47 | 48 | public void setPt(String pt) { 49 | this.pt = pt; 50 | } 51 | 52 | public boolean isHappy() { 53 | return happy; 54 | } 55 | 56 | public void setHappy(boolean happy) { 57 | this.happy = happy; 58 | } 59 | 60 | public List getAttributes() { 61 | return attributes; 62 | } 63 | 64 | public void setAttributes(List attributes) { 65 | this.attributes = attributes; 66 | } 67 | } -------------------------------------------------------------------------------- /core/src/test/java/com/highperformancespark/examples/JavaInteropTest.java: -------------------------------------------------------------------------------- 1 | package 
com.highperformancespark.examples; 2 | 3 | import com.holdenkarau.spark.testing.SharedJavaSparkContext; 4 | 5 | import scala.Tuple2; 6 | 7 | import org.apache.spark.rdd.RDD; 8 | import org.apache.spark.api.java.JavaRDD; 9 | import org.apache.spark.api.java.JavaPairRDD; 10 | import org.apache.spark.api.java.JavaSparkContext; 11 | 12 | import java.util.ArrayList; 13 | import java.util.Arrays; 14 | import java.util.List; 15 | 16 | import static org.junit.Assert.*; 17 | 18 | import org.junit.Test; 19 | import static org.junit.Assert.assertEquals; 20 | import static org.junit.Assert.assertTrue; 21 | 22 | public class JavaInteropTest extends SharedJavaSparkContext { 23 | 24 | @Test 25 | public void wrapPairRDDTest() { 26 | JavaInteropTestHelper helper = new JavaInteropTestHelper(sc()); 27 | JavaInterop ji = new JavaInterop(); 28 | RDD> rdd = helper.generateMiniPairRDD(); 29 | JavaPairRDD prdd = ji.wrapPairRDD(rdd); 30 | List> expected = Arrays.asList(new Tuple2("panda", 12L)); 31 | assertEquals(expected, prdd.collect()); 32 | } 33 | 34 | @Test 35 | public void wrapPairRDDFakeCtTest() { 36 | JavaInteropTestHelper helper = new JavaInteropTestHelper(sc()); 37 | JavaInterop ji = new JavaInterop(); 38 | RDD> rdd = helper.generateMiniPairRDD(); 39 | JavaPairRDD prdd = ji.wrapPairRDDFakeCt(rdd); 40 | List> expected = Arrays.asList(new Tuple2("panda", 12L)); 41 | assertEquals(expected, prdd.collect()); 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/wordcount/WordCount.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.wordcount 2 | 3 | /** 4 | * What sort of big data book would this be if we didn't mention wordcount? 5 | */ 6 | import org.apache.spark.rdd._ 7 | 8 | object WordCount { 9 | // bad idea: uses group by key 10 | def badIdea(rdd: RDD[String]): RDD[(String, Int)] = { 11 | val words = rdd.flatMap(_.split(" ")) 12 | val wordPairs = words.map((_, 1)) 13 | val grouped = wordPairs.groupByKey() 14 | val wordCounts = grouped.mapValues(_.sum) 15 | wordCounts 16 | } 17 | 18 | // good idea: doesn't use group by key 19 | //tag::simpleWordCount[] 20 | def simpleWordCount(rdd: RDD[String]): RDD[(String, Int)] = { 21 | val words = rdd.flatMap(_.split(" ")) 22 | val wordPairs = words.map((_, 1)) 23 | val wordCounts = wordPairs.reduceByKey(_ + _) 24 | wordCounts 25 | } 26 | //end::simpleWordCount[] 27 | 28 | /** 29 | * Come up with word counts but filter out the illegal tokens and stop words 30 | */ 31 | //tag::wordCountStopwords[] 32 | def withStopWordsFiltered(rdd : RDD[String], illegalTokens : Array[Char], 33 | stopWords : Set[String]): RDD[(String, Int)] = { 34 | val separators = illegalTokens ++ Array[Char](' ') 35 | val tokens: RDD[String] = rdd.flatMap(_.split(separators). 36 | map(_.trim.toLowerCase)) 37 | val words = tokens.filter(token => 38 | !stopWords.contains(token) && (token.length > 0) ) 39 | val wordPairs = words.map((_, 1)) 40 | val wordCounts = wordPairs.reduceByKey(_ + _) 41 | wordCounts 42 | } 43 | //end::wordCountStopwords[] 44 | } 45 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/dataframe/PandaPlaceFilterPushdown.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Happy Panda Example for DataFrames. 3 | * Computes the % of happy pandas. Very contrived. 
4 | */ 5 | package com.highperformancespark.examples.dataframe 6 | 7 | import scala.collection.mutable 8 | import scala.util.Random 9 | 10 | import org.apache.spark.sql.DataFrame 11 | import org.apache.spark.sql.Row 12 | import org.apache.spark.sql.{SQLContext, SparkSession} 13 | import org.apache.spark.sql.types._ 14 | 15 | import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo 16 | import com.highperformancespark.examples.dataframe.HappyPandas.Pandas 17 | import com.holdenkarau.spark.testing._ 18 | import org.scalatest.funsuite.AnyFunSuite 19 | import org.scalatest.matchers.should.Matchers._ 20 | 21 | case class ExtraMagic( 22 | place: String, 23 | pandaType: String, 24 | happyPandas: Integer, 25 | totalPandas: Integer, 26 | extraInfo: Integer) 27 | 28 | 29 | class PandaPlaceFilterPushdown extends AnyFunSuite with DataFrameSuiteBase { 30 | 31 | override def appName: String = "pandaPlaceFilterPushdown" 32 | 33 | val basicList = List( 34 | ExtraMagic("a", "b", 1, 2, 3), 35 | ExtraMagic("toronto", "b", 1, 2, 3), 36 | ) 37 | 38 | test("simpleFilterTest") { 39 | val sqlCtx = sqlContext 40 | import sqlCtx.implicits._ 41 | val inputDF = sqlCtx.createDataFrame(basicList) 42 | val restrictedDF = inputDF.select($"place", $"pandaType", $"happyPandas", $"totalPandas") 43 | val switched = inputDF.as[PandaInfo] 44 | // Note if we write the filter with functional syntax it does not push down. 45 | val filtered = switched.filter($"place" === "a") 46 | assert(filtered.count() === 1) 47 | } 48 | } 49 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/native/NativeExample.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Test our simple JNI 3 | */ 4 | package com.highperformancespark.examples.ffi 5 | 6 | import com.holdenkarau.spark.testing._ 7 | import org.scalacheck.Arbitrary 8 | import org.scalacheck.Gen 9 | import org.scalacheck.Prop.forAll 10 | import org.scalatest.funsuite.AnyFunSuite 11 | import org.scalatest.matchers.should.Matchers._ 12 | import org.scalatestplus.scalacheck.Checkers 13 | 14 | class NativeExampleSuite extends AnyFunSuite 15 | with SharedSparkContext with Checkers with RDDComparisons { 16 | 17 | test("local sum") { 18 | val input = Array(1, 2, 3) 19 | val sumMagic = new SumJNI() 20 | val result = sumMagic.sum(input) 21 | val expected = 6 22 | assert(result === expected) 23 | } 24 | 25 | test("super simple test") { 26 | val input = sc.parallelize(List(("hi", Array(1, 2, 3)))) 27 | val result = NativeExample.jniSum(input).collect() 28 | val expected = List(("hi", 6)) 29 | assert(result === expected) 30 | } 31 | 32 | test("native call should find sum correctly") { 33 | val property = forAll( 34 | RDDGenerator.genRDD[(String, Array[Int])](sc)( 35 | Arbitrary.arbitrary[(String, Array[Int])])) { 36 | rdd => 37 | val expected = rdd.mapValues(_.sum) 38 | val result = NativeExample.jniSum(rdd) 39 | compareRDDWithOrder(expected, result).isEmpty 40 | } 41 | check(property) 42 | } 43 | 44 | test("JNA support") { 45 | val input = Array(1, 2, 3) 46 | assert(6 === SumJNA.sum(input, input.size)) 47 | } 48 | 49 | test("JNA Fortran support") { 50 | val input = Array(1, 2, 3) 51 | assert(6 === SumFJNA.easySum(input.size, input)) 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/dataframe/SQLExtensionTest.scala: 
-------------------------------------------------------------------------------- 1 | /** 2 | * Happy Panda Example for DataFrames. 3 | * Computes the % of happy pandas. Very contrived. 4 | */ 5 | package com.highperformancespark.examples.dataframe 6 | 7 | import scala.collection.mutable 8 | import scala.util.Random 9 | 10 | import org.apache.spark.SparkConf 11 | import org.apache.spark.sql._ 12 | import org.apache.spark.sql.execution.ExplainMode 13 | import org.apache.spark.sql.types.IntegerType 14 | import org.apache.spark.sql.functions.{lower, rand} 15 | import org.apache.spark.sql.types._ 16 | 17 | import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo 18 | import com.highperformancespark.examples.dataframe.HappyPandas.Pandas 19 | import com.holdenkarau.spark.testing._ 20 | import org.scalatest.funsuite.AnyFunSuite 21 | import org.scalatest.matchers.should.Matchers._ 22 | 23 | class SQLExtensionTest extends AnyFunSuite with ScalaDataFrameSuiteBase { 24 | 25 | val rawPandaList = List( 26 | RawPanda(10L, "94110", "giant", true, Array(1.0, 0.9)), 27 | RawPanda(11L, "94110", "red", true, Array(1.0, 0.9))) 28 | 29 | override def conf: SparkConf = { 30 | val initialConf = super.conf 31 | initialConf.set( 32 | "spark.sql.extensions", 33 | "com.highperformancespark.examples.dataframe.SQLExtension") 34 | } 35 | 36 | def explainToString(df: DataFrame): String = { 37 | df.queryExecution.explainString(ExplainMode.fromString("extended")) 38 | } 39 | 40 | test("Magic") { 41 | import spark.implicits._ 42 | val inputDF = spark.createDataFrame(rawPandaList) 43 | spark.sql("DROP TABLE IF EXISTS farts") 44 | inputDF.write.saveAsTable("farts") 45 | val testDF = spark.read.table("farts") 46 | val explained: String = explainToString(testDF.select($"zip".cast(IntegerType))) 47 | explained should include ("isnotnull(zip#") 48 | } 49 | } 50 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/UDFs.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Example UDFs 3 | */ 4 | package com.highperformancespark.examples.dataframe 5 | 6 | import org.apache.spark.sql._ 7 | import org.apache.spark.sql.expressions._ 8 | import org.apache.spark.sql.types._ 9 | 10 | object UDFs { 11 | //tag::setupUDFs[] 12 | def setupUDFs(sqlCtx: SQLContext) = { 13 | sqlCtx.udf.register("strLen", (s: String) => s.length()) 14 | } 15 | //end::setupUDFs[] 16 | 17 | //tag::setupUDAFs[] 18 | def setupUDAFs(sqlCtx: SQLContext) = { 19 | class Avg extends UserDefinedAggregateFunction { 20 | // Input type 21 | def inputSchema: org.apache.spark.sql.types.StructType = 22 | StructType(StructField("value", DoubleType) :: Nil) 23 | 24 | def bufferSchema: StructType = StructType( 25 | StructField("count", LongType) :: 26 | StructField("sum", DoubleType) :: Nil 27 | ) 28 | 29 | // Return type 30 | def dataType: DataType = DoubleType 31 | 32 | def deterministic: Boolean = true 33 | 34 | def initialize(buffer: MutableAggregationBuffer): Unit = { 35 | buffer(0) = 0L 36 | buffer(1) = 0.0 37 | } 38 | 39 | def update(buffer: MutableAggregationBuffer,input: Row): Unit = { 40 | buffer(0) = buffer.getAs[Long](0) + 1 41 | buffer(1) = buffer.getAs[Double](1) + input.getAs[Double](0) 42 | } 43 | 44 | def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = { 45 | buffer1(0) = buffer1.getAs[Long](0) + buffer2.getAs[Long](0) 46 | buffer1(1) = buffer1.getAs[Double](1) + buffer2.getAs[Double](1) 47 
| } 48 | 49 | def evaluate(buffer: Row): Any = { 50 | buffer.getDouble(1) / buffer.getLong(0) 51 | } 52 | } 53 | // Optionally register 54 | val avg = new Avg 55 | sqlCtx.udf.register("ourAvg", avg) 56 | } 57 | //end::setupUDAFs[] 58 | } 59 | -------------------------------------------------------------------------------- /python/examples/pandera_ex.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.session import SparkSession 2 | 3 | # tag::pandera_imports[] 4 | import pandera.pyspark as pa 5 | import pyspark.sql.types as T 6 | 7 | # end::pandera_imports[] 8 | 9 | 10 | # tag::simple_data_schema[] 11 | class ProjectDataSchema(pa.DataFrameModel): 12 | # Note str_length is currently broken :/ 13 | creator: T.StringType() = pa.Field(str_length={"min_value": 1}) 14 | projectname: T.StringType() = pa.Field() 15 | stars: T.IntegerType() = pa.Field(ge=0) 16 | 17 | 18 | # end::simple_data_schema[] 19 | 20 | 21 | # tag::gender_data[] 22 | class GenderData(pa.DataFrameModel): 23 | MaleBonusPercent: T.DoubleType() = pa.Field(nullable=True, le=5) 24 | FemaleBonusPercent: T.DoubleType() = pa.Field(nullable=True) 25 | CompanyNumber: T.IntegerType() = pa.Field() 26 | 27 | 28 | # end::gender_data[] 29 | 30 | if __name__ == "__main__": 31 | spark = SparkSession.builder.master("local[4]").getOrCreate() 32 | # Make sure to make 33 | # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" 34 | # available as ./data/2021 35 | uk_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) 36 | 37 | # tag::validate_gender_data[] 38 | validated_df = GenderData(uk_df) 39 | # Print out the errors. You may wish to exit with an error condition. 40 | if validated_df.pandera.errors != {}: 41 | print(validated_df.pandera.errors) 42 | # sys.exit(1) 43 | # end::validate_gender_data[] 44 | 45 | # tag::validate_project_data[] 46 | project_data = spark.read.csv("./data/project.csv", header=True, inferSchema=True) 47 | validated_df = ProjectDataSchema(project_data) 48 | # Print out the errors. You may wish to exit with an error condition. 
49 | if validated_df.pandera.errors != {}: 50 | print(validated_df.pandera.errors) 51 | # sys.exit(1) 52 | # end::validate_project_data[] 53 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.class 2 | *.log 3 | 4 | # sbt specific 5 | .cache 6 | .history 7 | .lib/ 8 | dist/* 9 | target/ 10 | lib_managed/ 11 | src_managed/ 12 | project/boot/ 13 | project/plugins/project/ 14 | .bsp 15 | 16 | # Scala-IDE specific 17 | .scala_dependencies 18 | .worksheet 19 | .idea/ 20 | 21 | # emacs stuff 22 | \#*\# 23 | \.\#* 24 | *~ 25 | sbt/*launch*.jar 26 | 27 | # VSCode specific 28 | .vscode 29 | .history 30 | 31 | # Metals 32 | .metals 33 | .bloop 34 | metals.sbt 35 | 36 | # python 37 | *.pyc 38 | .tox 39 | .bsp 40 | 41 | # Distribution / packaging 42 | .Python 43 | build/ 44 | develop-eggs/ 45 | dist/ 46 | downloads/ 47 | eggs/ 48 | .eggs/ 49 | lib/ 50 | lib64/ 51 | parts/ 52 | sdist/ 53 | var/ 54 | wheels/ 55 | share/python-wheels/ 56 | *.egg-info/ 57 | .installed.cfg 58 | *.egg 59 | MANIFEST 60 | # scala stuff 61 | .metals 62 | 63 | # native 64 | *.o 65 | *.so 66 | *.so.0.0.0 67 | *.so.0 68 | 69 | # Spark files 70 | *.tgz 71 | iceberg-spark-runtime-*.jar 72 | spark-*-bin-hadoop*/ 73 | 74 | # Warehouse 75 | spark-warehouse/ 76 | warehouse/ 77 | metastore_db/ 78 | 79 | # Misc internal stuff 80 | sql/*.sql.out 81 | python/examples/*.py.out 82 | data/fetched/* 83 | spark_expectations_sample_rules.json 84 | 85 | # more python 86 | pyspark_venv.tar.gz 87 | pyspark_venv/ 88 | 89 | # accel stuff 90 | accelerators/*.jar 91 | accelerators/arrow-datafusion-comet 92 | # ignore gluten 93 | gluten 94 | gluten*.jar 95 | spark-3*hadoop*/ 96 | spark-3*hadoop*.tgz 97 | accelerators/incubator-gluten 98 | # ignore the temporary myapp from the dockerbuild 99 | myapp.tar 100 | # ignore glutten 101 | incubator-glutten/* 102 | # ignore nested build file. 103 | project/build.sbt 104 | coursier 105 | # Magic file we use for build tracking 106 | oldhash 107 | # ignore ipynb checkpoints 108 | .ipynb_checkpoints/ 109 | 110 | # ignore accel 111 | incubator-gluten/ 112 | -------------------------------------------------------------------------------- /python/examples/SQLLineage.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import DataFrame, Row 2 | from pyspark.sql.session import SparkSession 3 | import sys 4 | 5 | global df 6 | global sc 7 | global rdd 8 | global spark 9 | 10 | 11 | """ 12 | >>> df = rdd.toDF() 13 | >>> df2 = cutLineage(df) 14 | >>> df.head() == df2.head() 15 | True 16 | >>> df.schema == df2.schema 17 | True 18 | """ 19 | 20 | 21 | # tag::cutLineage[] 22 | def cutLineage(df): 23 | """ 24 | Cut the lineage of a DataFrame - used for iterative algorithms 25 | 26 | .. 
Note: This uses internal members and may break between versions 27 | >>> df = rdd.toDF() 28 | >>> cutDf = cutLineage(df) 29 | >>> cutDf.count() 30 | 3 31 | """ 32 | jRDD = df._jdf.toJavaRDD() 33 | jSchema = df._jdf.schema() 34 | jRDD.cache() 35 | session = df.sparkSession 36 | javaSparkSession = session._jsparkSession 37 | newJavaDF = javaSparkSession.createDataFrame(jRDD, jSchema) 38 | newDF = DataFrame(newJavaDF, session) 39 | return newDF 40 | 41 | 42 | # end::cutLineage[] 43 | 44 | 45 | def _setupTest(): 46 | globs = globals() 47 | spark = SparkSession.builder.master("local[4]").getOrCreate() 48 | sc = spark._sc 49 | sc.setLogLevel("ERROR") 50 | globs["sc"] = sc 51 | globs["spark"] = spark 52 | globs["rdd"] = sc.parallelize( 53 | [ 54 | Row(field1=1, field2="row1"), 55 | Row(field1=2, field2="row2"), 56 | Row(field1=3, field2="row3"), 57 | ] 58 | ) 59 | return globs 60 | 61 | 62 | def _test(): 63 | """ 64 | Run the tests. 65 | """ 66 | import doctest 67 | 68 | globs = _setupTest() 69 | (failure_count, test_count) = doctest.testmod( 70 | globs=globs, optionflags=doctest.ELLIPSIS 71 | ) 72 | globs["sc"].stop() 73 | if failure_count: 74 | exit(-1) 75 | 76 | 77 | if __name__ == "__main__": 78 | _test() 79 | # Hack to support running in nose 80 | elif sys.stdout != sys.__stdout__: 81 | _setupTest() 82 | -------------------------------------------------------------------------------- /python/examples/udf.py: -------------------------------------------------------------------------------- 1 | # This script triggers a number of different PySpark errors 2 | 3 | from pyspark.sql.session import SparkSession 4 | from pyspark.sql.functions import pandas_udf, udf 5 | from typing import Iterator 6 | import sys 7 | import pandas as pd 8 | 9 | global sc 10 | 11 | 12 | # We need the session before we can use @udf 13 | spark = SparkSession.builder.master("local[4]").getOrCreate() 14 | 15 | 16 | # tag::simple_udf[] 17 | @udf("long") 18 | def classic_add1(e: int) -> int: 19 | return e + 1 20 | 21 | 22 | # end::simple_udf[] 23 | 24 | 25 | # tag::agg_new_udf[] 26 | @pandas_udf("long") 27 | def pandas_sum(s: pd.Series) -> int: 28 | return s.sum() 29 | 30 | 31 | # end::agg_new_udf[] 32 | 33 | 34 | # tag::new_udf[] 35 | @pandas_udf("long") 36 | def pandas_add1(s: pd.Series) -> pd.Series: 37 | # Vectorized operation on all of the elems in series at once 38 | return s + 1 39 | 40 | 41 | # end::new_udf[] 42 | 43 | 44 | # tag::complex_udf[] 45 | @pandas_udf("long") 46 | def pandas_nested_add1(d: pd.DataFrame) -> pd.Series: 47 | # Takes a struct and returns the age elem + 1, if we wanted 48 | # to update (e.g. return struct) we could update d and return it instead. 
49 | return d["age"] + 1 50 | 51 | 52 | # end::complex_udf[] 53 | 54 | 55 | # tag::batches_of_batches_udf[] 56 | @pandas_udf("long") 57 | def pandas_batches_of_batches(t: Iterator[pd.Series]) -> Iterator[pd.Series]: 58 | my_db_connection = None # Expensive setup logic goes here 59 | for s in t: 60 | # Do something with your setup logic 61 | if my_db_connection is None: 62 | # Vectorized operation on all of the elems in series at once 63 | yield s + 1 64 | 65 | 66 | # end::batches_of_batches_udf[] 67 | 68 | 69 | if __name__ == "__main__": 70 | # Make sure to make 71 | # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" 72 | # available as ./data/2021 73 | uk_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) 74 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/native/PipeExample.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 16 | */ 17 | package com.highperformancespark.examples.ffi 18 | 19 | import org.apache.spark.SparkContext 20 | import org.apache.spark.SparkFiles 21 | import org.apache.spark.rdd._ 22 | 23 | object PipeExample { 24 | //tag::pipeExample[] 25 | def lookupUserPRS(sc: SparkContext, input: RDD[Int]): RDD[(Int, List[String])] = { 26 | // Copy our script to the worker nodes with sc.addFile 27 | // Add file requires absolute paths 28 | val distScriptName = "ghinfo.pl" 29 | val userDir = System.getProperty("user.dir") 30 | val localScript = s"${userDir}/src/main/perl/${distScriptName}" 31 | val addedFile = sc.addFile(localScript) 32 | 33 | // Pass enviroment variables to our worker 34 | val enviromentVars = Map("user" -> "apache", "repo" -> "spark") 35 | val result = input.map(x => x.toString) 36 | .pipe(SparkFiles.get(distScriptName), enviromentVars) 37 | // Parse the results 38 | result.map{record => 39 | val elems: Array[String] = record.split(" ") 40 | (elems(0).toInt, elems.slice(1, elems.size).sorted.distinct.toList) 41 | } 42 | } 43 | //end::pipeExample[] 44 | } 45 | -------------------------------------------------------------------------------- /core/src/main/r/wc.R: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. 
You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | args <- commandArgs(trailing = TRUE) 19 | 20 | if (length(args) != 1) { 21 | print("Usage: wc.R ") 22 | q("no") 23 | } 24 | 25 | fileName <- args(1) 26 | 27 | #tag::example[] 28 | 29 | library(SparkR) 30 | 31 | # Setup SparkContext & SQLContext 32 | sc <- sparkR.init(appName="high-performance-spark-wordcount-example") 33 | 34 | # Initialize SQLContext 35 | sqlContext <- sparkRSQL.init(sc) 36 | 37 | # Load some simple data 38 | 39 | df <- read.text(fileName) 40 | 41 | # Split the words 42 | words <- selectExpr(df, "split(value, \" \") as words") 43 | 44 | # Compute the count 45 | explodedWords <- select(words, alias(explode(words$words), "words")) 46 | wc <- agg(groupBy(explodedWords, "words"), "words" = "count") 47 | 48 | 49 | # Attempting to push an array back fails 50 | # resultingSchema <- structType(structField("words", "array")) 51 | # words <- dapply(df, function(line) { 52 | # y <- list() 53 | # y[[1]] <- strsplit(line[[1]], " ") 54 | # }, resultingSchema) 55 | # Also attempting even the identity transformation on a DF from read.text fails 56 | # in Spark 2.0-preview (although works fine on other DFs). 57 | 58 | # Display the result 59 | showDF(wc) 60 | #end::example[] 61 | -------------------------------------------------------------------------------- /accelerators/setup_gluten_spark34.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | mkdir -p /tmp/spark-events 4 | SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 5 | ACCEL_JARS=${SCRIPT_DIR} 6 | SPARK_MAJOR_VERSION=3.4 7 | SCALA_VERSION=${SCALA_VERSION:-"2.12"} 8 | 9 | set -ex 10 | 11 | # Note: this does not work on Ubuntu 23, only on 22 12 | # You might get something like: 13 | # # C [libgluten.so+0x30c753] gluten::Runtime::registerFactory(std::string const&, std::function, std::equal_to, std::allocator > > const&)>)+0x23 14 | 15 | 16 | SPARK_VERSION=3.4.2 17 | SPARK_MAJOR=3.4 18 | HADOOP_VERSION=3 19 | SPARK_DIR="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" 20 | SPARK_FILE="${SPARK_DIR}.tgz" 21 | 22 | export SPARK_MAJOR 23 | export SPARK_VERSION 24 | 25 | source setup_gluten_deps.sh 26 | 27 | cd .. 28 | source /etc/lsb-release 29 | # Pre-baked only 30 | if [ "$DISTRIB_RELEASE" == "20.04" ]; then 31 | source ./env_setup.sh 32 | cd "${SCRIPT_DIR}" 33 | 34 | GLUTEN_JAR="gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" 35 | GLUTEN_JAR_PATH="${SCRIPT_DIR}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" 36 | 37 | if [ ! -f "${GLUTEN_JAR_PATH}" ]; then 38 | wget "https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR}" || unset GLUTEN_JAR_PATH 39 | fi 40 | 41 | fi 42 | # Rather than if/else we fall through to build if wget fails because major version is not supported. 43 | if [ -z "$GLUTEN_JAR_PATH" ]; then 44 | #tag::build_gluten[] 45 | if [ ! 
-d incubator-gluten ]; then 46 | git clone https://github.com/apache/incubator-gluten.git 47 | fi 48 | cd incubator-gluten 49 | sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON 50 | mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests 51 | GLUTEN_JAR_PATH="$(pwd)/package/target/gluten-package-*-SNAPSHOT-${SPARK_MAJOR_VERSION}.jar" 52 | #end::build_gluten[] 53 | fi 54 | 55 | export GLUTEN_JAR_PATH 56 | 57 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Simple tests for our SimpleNaiveBayes demo pipeline stage 3 | */ 4 | package com.highperformancespark.examples.ml 5 | 6 | import org.apache.spark.ml._ 7 | import org.apache.spark.ml.feature._ 8 | import org.apache.spark.ml.param._ 9 | import org.apache.spark.sql.DataFrame 10 | import org.apache.spark.sql.Dataset 11 | import org.apache.spark.sql.Row 12 | import org.apache.spark.sql.SQLContext 13 | import org.apache.spark.sql.types._ 14 | 15 | import com.highperformancespark.examples.dataframe.HappyPandas.PandaInfo 16 | import com.highperformancespark.examples.dataframe.HappyPandas.Pandas 17 | import com.holdenkarau.spark.testing._ 18 | import org.scalatest.funsuite.AnyFunSuite 19 | import org.scalatest.matchers.should.Matchers._ 20 | 21 | case class MiniPanda(happy: Double, fuzzy: Double, old: Double) 22 | 23 | class SimpleNaiveBayesSuite extends AnyFunSuite with DataFrameSuiteBase { 24 | val miniPandasList = List( 25 | MiniPanda(1.0, 1.0, 1.0), 26 | MiniPanda(1.0, 1.0, 0.0), 27 | MiniPanda(1.0, 1.0, 0.0), 28 | MiniPanda(0.0, 0.0, 1.0), 29 | MiniPanda(0.0, 0.0, 0.0)) 30 | 31 | test("simple sanity test") { 32 | val session = spark 33 | import session.implicits._ 34 | val ds: Dataset[MiniPanda] = session.createDataset(miniPandasList) 35 | val assembler = new VectorAssembler() 36 | assembler.setInputCols(Array("fuzzy", "old")) 37 | assembler.setOutputCol("magical_features") 38 | val snb = new SimpleNaiveBayes() 39 | snb.setLabelCol("happy") 40 | snb.setFeaturesCol("magical_features") 41 | val pipeline = new Pipeline().setStages(Array(assembler, snb)) 42 | val model = pipeline.fit(ds) 43 | val test = ds.select("fuzzy", "old") 44 | val predicted = model.transform(test) 45 | assert(predicted.count() === miniPandasList.size) 46 | val nbModel = model.stages(1).asInstanceOf[SimpleNaiveBayesModel] 47 | assert(nbModel.getFeaturesCol === "magical_features") 48 | assert(nbModel.copy(ParamMap.empty).getFeaturesCol === "magical_features") 49 | } 50 | } 51 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/errors/throws.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.errors 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.rdd.RDD 5 | 6 | object Throws { 7 | def throwInner(sc: SparkContext) = { 8 | //tag::throwInner1[] 9 | val data = sc.parallelize(List(1, 2, 3)) 10 | // Will throw an exception when forced to evaluate 11 | val transform1 = data.map(x => x/0) 12 | val transform2 = transform1.map(x => x + 1) 13 | transform2.collect() // Forces evaluation 14 | //end::throwInner1[] 15 | } 16 | 17 | def throwOuter(sc: SparkContext) = { 18 | //tag::throwOuter1[] 19 | val data = sc.parallelize(List(1, 2, 3)) 20 | val transform1 = data.map(x => x + 1) 21 | // Will throw 
an exception when forced to evaluate 22 | val transform2 = transform1.map(x => x/0) 23 | transform2.collect() // Forces evaluation 24 | //end::throwOuter1[] 25 | } 26 | 27 | //tag::badFunctions[] 28 | def add1(x: Int): Int = { 29 | x + 1 30 | } 31 | 32 | def divZero(x: Int): Int = { 33 | x / 0 34 | } 35 | //end::badFunctions[] 36 | 37 | //tag::badEx3[] 38 | def throwInner2(sc: SparkContext) = { 39 | val data = sc.parallelize(List(1, 2, 3)) 40 | // Will throw an exception when forced to evaluate 41 | val transform1 = data.map(divZero) 42 | val transform2 = transform1.map(add1) 43 | transform2.collect() // Forces evaluation 44 | } 45 | 46 | def throwOuter2(sc: SparkContext) = { 47 | val data = sc.parallelize(List(1, 2, 3)) 48 | val transform1 = data.map(add1) 49 | // Will throw an exception when forced to evaluate 50 | val transform2 = transform1.map(divZero) 51 | transform2.collect() // Forces evaluation 52 | } 53 | //end::badEx3 54 | 55 | def nonExistentInput(sc: SparkContext) = { 56 | //tag::nonExistentInput[] 57 | val input = sc.textFile("file:///doesnotexist.txt") 58 | val data = input.map(x => x.toInt) 59 | val transform = data.map(x => x + 1) 60 | transform.collect() // Forces evaluation 61 | //end::nonExistentInput[] 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/tools/SampleData.scala: -------------------------------------------------------------------------------- 1 | import scala.reflect.ClassTag 2 | import scala.util.Random 3 | 4 | import org.apache.spark.SparkContext 5 | import org.apache.spark.rdd.RDD 6 | 7 | /** 8 | * Sample our production data to be able to use it for tests 9 | */ 10 | object SampleData { 11 | /** 12 | * Sample the input down to k % for usage in tests 13 | */ 14 | def sampleInput[T](rdd: RDD[T]): RDD[T] = { 15 | // tag::randomSampleInput[] 16 | rdd.sample(withReplacement=false, fraction=0.1) 17 | // end::randomSampleInput[] 18 | } 19 | 20 | /** 21 | * Construct a stratified sample 22 | */ 23 | def stratifiedSample(rdd: RDD[(String, Array[Double])]): 24 | RDD[(String, Array[Double])] = { 25 | // tag::stratifiedSample[] 26 | // 5% of the red pandas, and 50% of the giant pandas 27 | val stratas = Map("red" -> 0.05, "giant" -> 0.50) 28 | rdd.sampleByKey(withReplacement=false, fractions = stratas) 29 | // end::stratifiedSample[] 30 | } 31 | 32 | /** 33 | * Custom random sample with RNG. This is intended as an example of how 34 | * to save setup overhead. 35 | */ 36 | def slowSampleInput[T: ClassTag](rdd: RDD[T]): RDD[T] = { 37 | rdd.flatMap{x => val r = new Random() 38 | if (r.nextInt(10) == 0) { 39 | Some(x) 40 | } else { 41 | None 42 | }} 43 | } 44 | 45 | /** 46 | * Custom random sample with RNG. This is intended as an example of how to 47 | * save setup overhead. 
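   * A rough cost picture (illustrative, not a measured benchmark): the
   * `slowSampleInput` above seeds a new `Random` per element, while the
   * `mapPartitions` version below seeds one per partition, e.g., given a
   * `SparkContext` named `sc`:
   * {{{
   *   // one RNG per partition instead of one per row
   *   val sampled = customSampleInput(sc.parallelize(1 to 1000000, 8))
   * }}}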
48 | */ 49 | def customSampleInput[T: ClassTag](rdd: RDD[T]): RDD[T] = { 50 | // tag::mapPartitions[] 51 | rdd.mapPartitions{itr => 52 | // Only create once RNG per partitions 53 | val r = new Random() 54 | itr.filter(x => r.nextInt(10) == 0) 55 | } 56 | // end::mapPartitions[] 57 | } 58 | 59 | // tag::broadcast[] 60 | class LazyPrng { 61 | @transient lazy val r = new Random() 62 | } 63 | def customSampleBroadcast[T: ClassTag](sc: SparkContext, rdd: RDD[T]): RDD[T]= { 64 | val bcastprng = sc.broadcast(new LazyPrng()) 65 | rdd.filter(x => bcastprng.value.r.nextInt(10) == 0) 66 | } 67 | // end::broadcast[] 68 | } 69 | -------------------------------------------------------------------------------- /conf/log4j.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Licensed to the Apache Software Foundation (ASF) under one or more 3 | # contributor license agreements. See the NOTICE file distributed with 4 | # this work for additional information regarding copyright ownership. 5 | # The ASF licenses this file to You under the Apache License, Version 2.0 6 | # (the "License"); you may not use this file except in compliance with 7 | # the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | # 17 | 18 | # Set everything to be logged to the console 19 | log4j.rootCategory=ERROR, console 20 | log4j.appender.console=org.apache.log4j.ConsoleAppender 21 | log4j.appender.console.target=System.err 22 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 23 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 24 | 25 | # Set the default spark-shell log level to WARN. When running the spark-shell, the 26 | # log level for this class is used to overwrite the root logger's log level, so that 27 | # the user can have different defaults for the shell and regular Spark apps. 
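# An illustrative override (not part of the stock config): to watch scheduler
# activity while keeping everything else at ERROR, a per-logger line in the
# same style as those below can be added, e.g.
# log4j.logger.org.apache.spark.scheduler.DAGScheduler=INFO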
28 | log4j.logger.org.apache.spark.repl.Main=ERROR 29 | 30 | # Settings to quiet third party logs that are too verbose 31 | log4j.logger.org.spark-project.jetty=ERROR 32 | log4j.logger.org.spark-project.jetty.util.component.AbstractLifeCycle=ERROR 33 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 34 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 35 | log4j.logger.org.apache.parquet=ERROR 36 | log4j.logger.parquet=ERROR 37 | 38 | # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support 39 | log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL 40 | log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR 41 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/wordcount/WordCountTest.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.wordcount 2 | 3 | 4 | import com.holdenkarau.spark.testing.SharedSparkContext 5 | import org.scalatest.funsuite.AnyFunSuite 6 | 7 | class WordCountTest extends AnyFunSuite with SharedSparkContext { 8 | test("word count with Stop Words Removed"){ 9 | val wordRDD = sc.parallelize(Seq( 10 | "How happy was the panda? You ask.", 11 | "Panda is the most happy panda in all the #$!?ing land!")) 12 | 13 | val stopWords: Set[String] = Set("a", "the", "in", "was", "there", "she", "he") 14 | val illegalTokens: Array[Char] = "#$%?!.".toCharArray 15 | 16 | val wordCounts = WordCount.withStopWordsFiltered( 17 | wordRDD, illegalTokens, stopWords) 18 | val wordCountsAsMap = wordCounts.collectAsMap() 19 | assert(!wordCountsAsMap.contains("the")) 20 | assert(!wordCountsAsMap.contains("?")) 21 | assert(!wordCountsAsMap.contains("#$!?ing")) 22 | assert(wordCountsAsMap.contains("ing")) 23 | assert(wordCountsAsMap.get("panda").get.equals(3)) 24 | } 25 | 26 | test("word count with simple counting") { 27 | val wordRDD = sc.parallelize( 28 | Seq( 29 | "a b c d", 30 | "b c d e" 31 | ) 32 | ) 33 | val wordCounts = WordCount.simpleWordCount(wordRDD) 34 | 35 | val wordCountsAsMap = wordCounts.collectAsMap() 36 | 37 | for (character <- 'a' to 'e') { 38 | assert(wordCountsAsMap.contains(character.toString)) 39 | } 40 | for (character <- 'b' to 'd') { 41 | assert(wordCountsAsMap.get(character.toString).get == 2) 42 | } 43 | } 44 | 45 | test("word count with bad idea") { 46 | val wordRDD = sc.parallelize( 47 | Seq( 48 | "a b c d", 49 | "b c d e" 50 | ) 51 | ) 52 | val wordCounts = WordCount.badIdea(wordRDD) 53 | 54 | val wordCountsAsMap = wordCounts.collectAsMap() 55 | 56 | for (character <- 'a' to 'e') { 57 | assert(wordCountsAsMap.contains(character.toString)) 58 | } 59 | for (character <- 'b' to 'd') { 60 | assert(wordCountsAsMap.get(character.toString).get == 2) 61 | } 62 | } 63 | } 64 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/tools/FilterInvalidPandas.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.tools 2 | 3 | import scala.collection.immutable.HashSet 4 | 5 | import org.apache.spark._ 6 | import org.apache.spark.rdd.RDD 7 | 8 | import com.highperformancespark.examples.dataframe.RawPanda 9 | //tag::loggerImport[] 10 | import org.apache.logging.log4j.LogManager 11 | //end::loggerImport[] 12 | 13 | object 
FilterInvalidPandas { 14 | 15 | def filterInvalidPandas(sc: SparkContext, invalidPandas: List[Long], 16 | input: RDD[RawPanda]) = { 17 | //tag::broadcast[] 18 | val invalid: HashSet[Long] = HashSet() ++ invalidPandas 19 | val invalidBroadcast = sc.broadcast(invalid) 20 | input.filter{panda => !invalidBroadcast.value.contains(panda.id)} 21 | //end::broadcast[] 22 | } 23 | 24 | def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long], 25 | input: RDD[RawPanda]) = { 26 | //tag::broadcastAndLog[] 27 | val invalid: HashSet[Long] = HashSet() ++ invalidPandas 28 | val invalidBroadcast = sc.broadcast(invalid) 29 | def keepPanda(pandaId: Long) = { 30 | val logger = LogManager.getLogger("fart based logs") 31 | if (invalidBroadcast.value.contains(pandaId)) { 32 | logger.debug("hi") 33 | false 34 | } else { 35 | true 36 | } 37 | } 38 | input.filter{panda => keepPanda(panda.id)} 39 | //end::broadcastAndLog[] 40 | } 41 | } 42 | 43 | //tag::broadcastAndLogClass[] 44 | class AltLog() { 45 | lazy val logger = LogManager.getLogger("fart based logs") 46 | def filterInvalidPandasWithLogs(sc: SparkContext, invalidPandas: List[Long], 47 | input: RDD[RawPanda]) = { 48 | val invalid: HashSet[Long] = HashSet() ++ invalidPandas 49 | val invalidBroadcast = sc.broadcast(invalid) 50 | def keepPanda(pandaId: Long) = { 51 | val logger = LogManager.getLogger("fart based logs") 52 | if (invalidBroadcast.value.contains(pandaId)) { 53 | logger.debug("hi") 54 | false 55 | } else { 56 | true 57 | } 58 | } 59 | input.filter{panda => keepPanda(panda.id)} 60 | } 61 | } 62 | //end::broadcastAndLogClass[] 63 | -------------------------------------------------------------------------------- /core/src/main/perl/README: -------------------------------------------------------------------------------- 1 | HighPerformanceSpark-Examples 2 | 3 | The README is used to introduce the module and provide instructions on 4 | how to install the module, any machine dependencies it may have (for 5 | example C compilers and installed libraries) and any other information 6 | that should be provided before the module is installed. 7 | 8 | A README file is required for CPAN modules since CPAN extracts the README 9 | file from a module distribution so that people browsing the archive 10 | can use it to get an idea of the module's uses. It is usually a good idea 11 | to provide version information here so that people can decide whether 12 | fixes for the module are worth downloading. 13 | 14 | 15 | INSTALLATION 16 | 17 | To install this module, run the following commands: 18 | 19 | perl Makefile.PL 20 | make 21 | make test 22 | make install 23 | 24 | SUPPORT AND DOCUMENTATION 25 | 26 | After installing, you can find documentation for this module with the 27 | perldoc command. 28 | 29 | perldoc HighPerformanceSpark::Examples 30 | 31 | You can also look for information at: 32 | 33 | RT, CPAN's request tracker (report bugs here) 34 | http://rt.cpan.org/NoAuth/Bugs.html?Dist=HighPerformanceSpark-Examples 35 | 36 | AnnoCPAN, Annotated CPAN documentation 37 | http://annocpan.org/dist/HighPerformanceSpark-Examples 38 | 39 | CPAN Ratings 40 | http://cpanratings.perl.org/d/HighPerformanceSpark-Examples 41 | 42 | Search CPAN 43 | http://search.cpan.org/dist/HighPerformanceSpark-Examples/ 44 | 45 | 46 | LICENSE AND COPYRIGHT 47 | 48 | Copyright (C) 2016 Holden Karau And Rachel Warren 49 | 50 | Licensed under the Apache License, Version 2.0 (the "License"); 51 | you may not use this file except in compliance with the License. 
52 | You may obtain a copy of the License at 53 | 54 | L 55 | 56 | Unless required by applicable law or agreed to in writing, software 57 | distributed under the License is distributed on an "AS IS" BASIS, 58 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 59 | See the License for the specific language governing permissions and 60 | limitations under the License. 61 | 62 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala_back: -------------------------------------------------------------------------------- 1 | /** 2 | * A sample mixing relational & functional transformations with Datasets. 3 | */ 4 | package com.highperformancespark.examples.dataframe 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.catalyst.expressions.aggregate._ 10 | import org.apache.spark.sql.expressions._ 11 | import org.apache.spark.sql.functions._ 12 | import org.apache.spark.sql.types._ 13 | // Additional imports for using HiveContext 14 | import org.apache.spark.sql.hive._ 15 | import org.apache.spark.sql.hive.thriftserver._ 16 | 17 | class MixedDataset(sqlCtx: SQLContext) { 18 | import sqlCtx.implicits._ 19 | 20 | /** 21 | * A sample function on a Dataset of RawPandas. 22 | * This is contrived, since our reduction could also be done with SQL aggregates, but 23 | * we can see the flexibility of being able to specify arbitrary Scala code. 24 | */ 25 | def happyPandaSums(ds: Dataset[RawPanda]): Double = { 26 | ds.toDF().filter($"happy" === true).as[RawPanda]. 27 | select($"attributes"(0).as[Double]). 28 | reduce((x, y) => x + y) 29 | } 30 | 31 | /** 32 | * Functional map + Dataset, sums the positive attributes for the pandas 33 | */ 34 | def funMap(ds: Dataset[RawPanda]): Dataset[Double] = { 35 | ds.map{rp => rp.attributes.filter(_ > 0).sum} 36 | } 37 | 38 | /** 39 | * Illustrate how we make typed queries, using some of the float properties to produce boolean 40 | * values. 41 | */ 42 | def typedQueryExample(ds: Dataset[RawPanda]): Dataset[Double] = { 43 | ds.select($"attributes"(0).as[Double]) 44 | } 45 | 46 | /** 47 | * Illustrate converting a Dataset to an RDD 48 | */ 49 | def toRDD(ds: Dataset[RawPanda]): RDD[RawPanda] = { 50 | ds.rdd 51 | } 52 | 53 | /** 54 | * Illustrate converting a Dataset to a DataFrame 55 | */ 56 | def toDF(ds: Dataset[RawPanda]): DataFrame = { 57 | ds.toDF() 58 | } 59 | 60 | /** 61 | * Illustrate DataFrame to Dataset. Its important to note that if the schema does not match what 62 | * is expected by the Dataset this fails fast. 
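   * For example (illustrative): if `df` is missing the `zip` column, or a
   * column cannot be safely up-cast to the type declared in `RawPanda`, then
   * `df.as[RawPanda]` throws an `AnalysisException` as soon as the Dataset is
   * declared, rather than when an action is eventually run.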
63 | */ 64 | def fromDF(df: DataFrame): Dataset[RawPanda] = { 65 | df.as[RawPanda] 66 | } 67 | } 68 | -------------------------------------------------------------------------------- /run_sql_examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -ex 3 | set -o pipefail 4 | 5 | source env_setup.sh 6 | 7 | # You might want to set SPARK_EXTRA to do things like log more info 8 | 9 | function run_example () { 10 | local sql_file="$1" 11 | local extra="$2" 12 | EXTENSIONS=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions 13 | if [ -n "$EXTRA_EXTENSIONS" ]; then 14 | EXTENSIONS="$EXTENSIONS,$EXTRA_EXTENSIONS" 15 | fi 16 | # shellcheck disable=SC2046,SC2086 17 | ${SPARK_HOME}/bin/spark-sql --master local[5] \ 18 | --conf spark.eventLog.enabled=true \ 19 | --conf spark.sql.extensions=$EXTENSIONS \ 20 | --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ 21 | --conf spark.sql.catalog.spark_catalog.type=hive \ 22 | --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ 23 | --conf spark.sql.catalog.local.type=hadoop \ 24 | --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ 25 | ${extra} ${SPARK_EXTRA} \ 26 | $(cat "${sql_file}.conf" || echo "") \ 27 | --name "${sql_file}" \ 28 | -f "${sql_file}" 2>&1 | tee -a "${sql_file}.out" || ls "${sql_file}.expected_to_fail" 29 | } 30 | 31 | 32 | # If you want to look at them 33 | # ${SPARK_PATH}/sbin/start-history-server.sh 34 | 35 | if [ $# -eq 1 ]; then 36 | if [[ "$1" != *"gluten_only"* ]]; then 37 | run_example "sql/$1" 38 | else 39 | echo "Processing gluten ${sql_file}" 40 | # shellcheck disable=SC2046 41 | run_example "$sql_file" 42 | fi 43 | else 44 | # For each SQL 45 | for sql_file in sql/*.sql; do 46 | if [[ "$sql_file" != *"_only"* ]]; then 47 | echo "Processing ${sql_file}" 48 | # shellcheck disable=SC2046 49 | run_example "$sql_file" 50 | elif [[ "$sql_file" != *"gluten_only"* && "$GLUTEN_EXISTS" == "true" ]]; then 51 | echo "Processing gluten ${sql_file}" 52 | # shellcheck disable=SC2046 53 | run_example "$sql_file" 54 | elif [[ "$sql_file" != *"gluten_udf_only"* && "$GLUTEN_UDF_EXISTS" == "true" ]]; then 55 | echo "Processing gluten UDF ${sql_file}" 56 | # shellcheck disable=SC2046 57 | run_example "$sql_file" 58 | else 59 | echo "Skipping $sql_file since we did not find gluten and this is restricted example." 60 | fi 61 | done 62 | fi 63 | -------------------------------------------------------------------------------- /native/src/CMakeLists.txt: -------------------------------------------------------------------------------- 1 | ################################################################ 2 | # A minimal CMake file that is compatible with sbt-jni # 3 | # # 4 | # All settings required by sbt-jni have been marked so, please # 5 | # add/modify/remove settings to build your specific library. 
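# A typical manual invocation, for reference (illustrative; sbt-jni normally
# configures and drives CMake itself when building through sbt):
#   mkdir -p native/target/manual-build && cd native/target/manual-build
#   cmake ../../src -DSBT=OFF && make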
# 6 | ################################################################ 7 | 8 | cmake_minimum_required(VERSION 3.12) 9 | 10 | option(SBT "Set if invoked from sbt-jni" OFF) 11 | 12 | # Define project and related variables 13 | # (required by sbt-jni) please use semantic versioning 14 | # 15 | project (high-performance-spark) 16 | enable_language(Fortran) 17 | set(PROJECT_VERSION_MAJOR 0) 18 | set(PROJECT_VERSION_MINOR 0) 19 | set(PROJECT_VERSION_PATCH 0) 20 | 21 | set (LIB_NAME ${PROJECT_NAME}${PROJECT_VERSION_MAJOR}) 22 | 23 | #tag::velox[] 24 | set (GLUTEN_LIB_NAME ${PROJECT_NAME}-gluten-${PROJECT_VERSION_MAJOR}) 25 | # For gluten+velox, you can leave out if not using gluten 26 | set(GLUTEN_HOME ../../gluten) 27 | set(CMAKE_FIND_DEBUG_MODE TRUE) 28 | find_library(VELOX_LIBRARY NAMES velox HINTS 29 | ${GLUTEN_HOME}/cpp/build/releases NO_DEFAULT_PATH) 30 | # End gluten specific 31 | 32 | if(VELOX_LIBRARY) 33 | file(GLOB GLUTEN_UDF_FILES 34 | "./c/gluten/*.cpp") 35 | add_library(${GLUTEN_LIB_NAME} SHARED ${GLUTEN_UDF_FILES}) 36 | target_include_directories(${GLUTEN_LIB_NAME} PRIVATE ${GLUTEN_HOME}/cpp ${GLUTEN_HOME}/ep/build-velox/build/velox_ep) 37 | target_link_libraries(${GLUTEN_LIB_NAME} PRIVATE ${VELOX_LIBRARY}) 38 | else() 39 | message(WARNING "Velox library not found. Specific path not added.") 40 | endif() 41 | #end::velox[] 42 | 43 | # Setup JNI 44 | find_package(JNI REQUIRED) 45 | if (JNI_FOUND) 46 | message (STATUS "JNI include directories: ${JNI_INCLUDE_DIRS}") 47 | endif() 48 | 49 | # Include directories 50 | include_directories(.) 51 | include_directories(include) 52 | include_directories(${JNI_INCLUDE_DIRS}) 53 | 54 | # Sources 55 | file(GLOB LIB_SRC 56 | "*.c" 57 | "*.f95" 58 | "*.f*" 59 | "*.cc" 60 | "*.cpp" 61 | "./c/*.c" 62 | "./c/*.cpp" 63 | "./fortran/*.f95" 64 | "./fortran/*.f*" 65 | ) 66 | 67 | # Setup installation targets 68 | # (required by sbt-jni) major version should always be appended to library name 69 | # 70 | add_library(${LIB_NAME} SHARED ${LIB_SRC}) 71 | install(TARGETS ${LIB_NAME} LIBRARY DESTINATION .) 
72 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/dataframe/JavaUDFs.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.dataframe; 2 | 3 | import org.apache.spark.sql.Row; 4 | import org.apache.spark.sql.SQLContext; 5 | import org.apache.spark.sql.expressions.MutableAggregationBuffer; 6 | import org.apache.spark.sql.expressions.UserDefinedAggregateFunction; 7 | import org.apache.spark.sql.types.*; 8 | 9 | public class JavaUDFs { 10 | 11 | public static void setupUDFs(SQLContext sqlContext) { 12 | //tag::basicUDF[] 13 | sqlContext.udf() 14 | .register("strlen", 15 | (String s) -> s.length(), DataTypes.StringType); 16 | //end::basicUDF[] 17 | } 18 | 19 | public static void setupUDAFs(SQLContext sqlContext) { 20 | 21 | class Avg extends UserDefinedAggregateFunction { 22 | 23 | @Override 24 | public StructType inputSchema() { 25 | StructType inputSchema = 26 | new StructType(new StructField[]{new StructField("value", DataTypes.DoubleType, true, Metadata.empty())}); 27 | return inputSchema; 28 | } 29 | 30 | @Override 31 | public StructType bufferSchema() { 32 | StructType bufferSchema = 33 | new StructType(new StructField[]{ 34 | new StructField("count", DataTypes.LongType, true, Metadata.empty()), 35 | new StructField("sum", DataTypes.DoubleType, true, Metadata.empty()) 36 | }); 37 | 38 | return bufferSchema; 39 | } 40 | 41 | @Override 42 | public DataType dataType() { 43 | return DataTypes.DoubleType; 44 | } 45 | 46 | @Override 47 | public boolean deterministic() { 48 | return true; 49 | } 50 | 51 | @Override 52 | public void initialize(MutableAggregationBuffer buffer) { 53 | buffer.update(0, 0L); 54 | buffer.update(1, 0.0); 55 | } 56 | 57 | @Override 58 | public void update(MutableAggregationBuffer buffer, Row input) { 59 | buffer.update(0, buffer.getLong(0) + 1); 60 | buffer.update(1, buffer.getDouble(1) + input.getDouble(0)); 61 | } 62 | 63 | @Override 64 | public void merge(MutableAggregationBuffer buffer1, Row buffer2) { 65 | buffer1.update(0, buffer1.getLong(0) + buffer2.getLong(0)); 66 | buffer1.update(1, buffer1.getDouble(1) + buffer2.getDouble(1)); 67 | } 68 | 69 | @Override 70 | public Object evaluate(Row buffer) { 71 | return buffer.getDouble(1) / buffer.getLong(0); 72 | } 73 | } 74 | 75 | Avg average = new Avg(); 76 | sqlContext.udf().register("ourAvg", average); 77 | } 78 | } 79 | -------------------------------------------------------------------------------- /env_setup.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | # Download Spark and iceberg if not present 6 | SPARK_MAJOR=${SPARK_MAJOR:-"3.5"} 7 | SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.3"} 8 | SCALA_VERSION=${SCALA_VERSION:-"2.13"} 9 | HADOOP_VERSION="3" 10 | SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" 11 | SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" 12 | if [ "$SCALA_VERSION" = "2.13" ]; then 13 | SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3-scala2.13.tgz" 14 | SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}-scala2.13" 15 | fi 16 | ICEBERG_VERSION=${ICEBERG_VERSION:-"1.9.2"} 17 | if [ ! 
-f "${SPARK_FILE}" ]; then 18 | SPARK_DIST_URL="https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" 19 | SPARK_ARCHIVE_DIST_URL="https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" 20 | if command -v axel &> /dev/null 21 | then 22 | (axel --quiet "$SPARK_DIST_URL" || axel --quiet "$SPARK_ARCHIVE_DIST_URL") & 23 | else 24 | (wget --quiet "$SPARK_DIST_URL" || wget --quiet "$SPARK_ARCHIVE_DIST_URL") & 25 | fi 26 | fi 27 | # Download Icberg if not present 28 | ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}-${ICEBERG_VERSION}.jar" 29 | if [ ! -f "${ICEBERG_FILE}" ]; then 30 | wget --quiet "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & 31 | fi 32 | wait 33 | sleep 1 34 | # Setup the env 35 | if [ ! -d "${SPARK_PATH}" ]; then 36 | tar -xf "${SPARK_FILE}" 37 | fi 38 | 39 | SPARK_HOME="${SPARK_PATH}" 40 | export SPARK_HOME 41 | 42 | if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then 43 | # Delete the old JAR first. 44 | rm "${SPARK_PATH}/jars/iceberg-spark-runtime*.jar" || echo "No old version to delete." 45 | cp "${ICEBERG_FILE}" "${SPARK_PATH}/jars/${ICEBERG_FILE}" 46 | fi 47 | 48 | # Set up for running pyspark and friends 49 | export PATH="${SPARK_PATH}:${SPARK_PATH}/python:${SPARK_PATH}/bin:${SPARK_PATH}/sbin:${PATH}" 50 | 51 | # Make sure we have a history directory 52 | mkdir -p /tmp/spark-events 53 | 54 | mkdir -p ./data/fetched/ 55 | if [ ! -f ./data/fetched/2021 ]; then 56 | wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" -O ./data/fetched/2021 57 | fi 58 | if [ ! -f ./data/fetched/2022 ]; then 59 | wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2022" -O ./data/fetched/2022 60 | fi 61 | if [ ! -f ./data/fetched/2023 ]; then 62 | wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2023" -O ./data/fetched/2023 63 | fi 64 | 65 | -------------------------------------------------------------------------------- /run_pyspark_examples.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # shellcheck disable=SC1091,SC2034 3 | 4 | source env_setup.sh 5 | 6 | set -ex 7 | 8 | set -o pipefail 9 | 10 | #tag::package_venv[] 11 | if [ ! -d pyspark_venv ]; then 12 | python -m venv pyspark_venv 13 | fi 14 | 15 | source pyspark_venv/bin/activate 16 | pip install -r ./python/requirements.txt 17 | 18 | if [ ! -f pyspark_venv.tar.gz ]; then 19 | venv-pack -o pyspark_venv.tar.gz 20 | fi 21 | 22 | 23 | # Set in local and client mode where the driver uses the Python present 24 | # (requires that you have activated the venv as we did above) 25 | PYSPARK_DRIVER_PYTHON=python 26 | export PYSPARK_DRIVER_PYTHON 27 | export PYTHON_PATH=./environment/bin/python 28 | #end::package_venv[] 29 | 30 | # Some hack for our json magic 31 | cat se*.json > spark_expectations_sample_rules.json 32 | 33 | function check_fail () { 34 | local ex="$1" 35 | local code="$2" 36 | if [ -f "${ex}.fail" ]; then 37 | echo "ok"; 38 | else 39 | exit "$code" 40 | fi 41 | } 42 | 43 | EXAMPLE_JAR="./core/target/scala-2.13/core-assembly-0.1.0-SNAPSHOT.jar" 44 | 45 | pip install setuptools 46 | 47 | # Iceberg JAR not yet available for Spark 4. 48 | if [ ! -f "${EXAMPLE_JAR}" ]; then 49 | rm ./core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala # temp hack no merge in Spark 3. 
50 | sbt core/assembly -DsparkVersion="${SPARK_VERSION}" 51 | fi 52 | 53 | if [ ! -f "${EXAMPLE_JAR}" ]; then 54 | echo "Can't find sample jar?!?" 55 | exit 1 56 | fi 57 | 58 | function run_example () { 59 | local ex="$1" 60 | # shellcheck disable=SC2046 61 | spark-submit \ 62 | --master local[5] \ 63 | --conf spark.eventLog.enabled=true \ 64 | --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ 65 | --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ 66 | --conf spark.sql.catalog.spark_catalog.type=hive \ 67 | --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ 68 | --conf spark.sql.catalog.local.type=hadoop \ 69 | --archives pyspark_venv.tar.gz#environment \ 70 | --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ 71 | $(cat "${ex}.conf" || echo "") \ 72 | --name "${ex}" \ 73 | --jars "${EXAMPLE_JAR}" \ 74 | "${ex}" 2>&1 | tee -a "${ex}.out" || check_fail "$ex" $? 75 | } 76 | 77 | if [ $# -eq 1 ]; then 78 | run_example "python/examples/$1" 79 | else 80 | for ex in python/examples/*.py; do 81 | if [[ "$ex" =~ test.* ]]; then 82 | echo "Skipping ex $ex as it is a test and covered by our tests." 83 | else 84 | echo "Running $ex" 85 | run_example "$ex" 86 | fi 87 | done 88 | fi 89 | -------------------------------------------------------------------------------- /native/src/c/gluten/GlutenUDF.cpp: -------------------------------------------------------------------------------- 1 | // Filename MyUDF.cpp 2 | 3 | #include 4 | #include 5 | #include 6 | 7 | 8 | namespace { 9 | using namespace facebook::velox; 10 | 11 | template 12 | class PlusConstantFunction : public exec::VectorFunction { 13 | public: 14 | explicit PlusConstantFunction(int32_t addition) : addition_(addition) {} 15 | 16 | void apply( 17 | const SelectivityVector& rows, 18 | std::vector& args, 19 | const TypePtr& /* outputType */, 20 | exec::EvalCtx& context, 21 | VectorPtr& result) const override { 22 | using nativeType = typename TypeTraits::NativeType; 23 | VELOX_CHECK_EQ(args.size(), 1); 24 | 25 | auto& arg = args[0]; 26 | 27 | // The argument may be flat or constant. 
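    // (Illustrative gloss: a "flat" vector stores one value per row in a
    // contiguous buffer, while a "constant" vector holds a single value shared
    // by every selected row -- hence the two read paths in the branches below.)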
28 | VELOX_CHECK(arg->isFlatEncoding() || arg->isConstantEncoding()); 29 | 30 | BaseVector::ensureWritable(rows, createScalarType<kind>(), context.pool(), result); 31 | 32 | auto* flatResult = result->asFlatVector<nativeType>(); 33 | auto* rawResult = flatResult->mutableRawValues(); 34 | 35 | flatResult->clearNulls(rows); 36 | 37 | if (arg->isConstantEncoding()) { 38 | auto value = arg->as<ConstantVector<nativeType>>()->valueAt(0); 39 | rows.applyToSelected([&](auto row) { rawResult[row] = value + addition_; }); 40 | } else { 41 | auto* rawInput = arg->as<FlatVector<nativeType>>()->rawValues(); 42 | 43 | rows.applyToSelected([&](auto row) { rawResult[row] = rawInput[row] + addition_; }); 44 | } 45 | } 46 | 47 | private: 48 | const int32_t addition_; 49 | }; 50 | 51 | static std::vector<std::shared_ptr<exec::FunctionSignature>> integerSignatures() { 52 | // integer -> integer 53 | return {exec::FunctionSignatureBuilder().returnType("integer").argumentType("integer").build()}; 54 | } 55 | 56 | static std::vector<std::shared_ptr<exec::FunctionSignature>> bigintSignatures() { 57 | // bigint -> bigint 58 | return {exec::FunctionSignatureBuilder().returnType("bigint").argumentType("bigint").build()}; 59 | } 60 | 61 | } // namespace 62 | 63 | const int kNumMyUdf = 2; 64 | gluten::UdfEntry myUdf[kNumMyUdf] = {{"myudf1", "integer"}, {"myudf2", "bigint"}}; 65 | 66 | DEFINE_GET_NUM_UDF { 67 | return kNumMyUdf; 68 | } 69 | 70 | DEFINE_GET_UDF_ENTRIES { 71 | for (auto i = 0; i < kNumMyUdf; ++i) { 72 | udfEntries[i] = myUdf[i]; 73 | } 74 | } 75 | 76 | DEFINE_REGISTER_UDF { 77 | facebook::velox::exec::registerVectorFunction( 78 | "myudf1", integerSignatures(), std::make_unique<PlusConstantFunction<facebook::velox::TypeKind::INTEGER>>(5)); 79 | facebook::velox::exec::registerVectorFunction( 80 | "myudf2", bigintSignatures(), std::make_unique<PlusConstantFunction<facebook::velox::TypeKind::BIGINT>>(5)); 81 | std::cout << "registered myudf1, myudf2" << std::endl; 82 | } 83 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/streaming/DStream.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Streaming Pandas Example with the old DStream APIs.
3 | */ 4 | package com.highperformancespark.examples.streaming 5 | 6 | import scala.reflect.ClassTag 7 | 8 | import org.apache.spark._ 9 | import org.apache.spark.rdd.RDD 10 | import org.apache.spark.streaming._ 11 | import org.apache.spark.streaming.dstream._ 12 | 13 | import org.apache.hadoop.io.LongWritable 14 | import org.apache.hadoop.io.Text 15 | import org.apache.hadoop.mapreduce.lib.input.TextInputFormat 16 | //end::DStreamImports[] 17 | 18 | object DStreamExamples { 19 | def makeStreamingContext(sc: SparkContext) = { 20 | //tag::ssc[] 21 | val batchInterval = Seconds(1) 22 | new StreamingContext(sc, batchInterval) 23 | //end::ssc[] 24 | } 25 | 26 | def makeRecoverableStreamingContext(sc: SparkContext, checkpointDir: String) = { 27 | //tag::sscRecover[] 28 | def createStreamingContext(): StreamingContext = { 29 | val batchInterval = Seconds(1) 30 | val ssc = new StreamingContext(sc, batchInterval) 31 | ssc.checkpoint(checkpointDir) 32 | // Then create whatever stream is required 33 | // And whatever mappings need to go on those streams 34 | ssc 35 | } 36 | val ssc = StreamingContext.getOrCreate(checkpointDir, 37 | createStreamingContext _) 38 | // Do whatever work needs to be done regardless of state 39 | // Start context and run 40 | ssc.start() 41 | //end::sscRecover[] 42 | } 43 | 44 | def fileAPIExample(ssc: StreamingContext, path: String): 45 | DStream[(Long, String)] = { 46 | //tag::file[] 47 | // You don't need to write the types of the InputDStream but it for illustration 48 | val inputDStream: InputDStream[(LongWritable, Text)] = 49 | ssc.fileStream[LongWritable, Text, TextInputFormat](path) 50 | // Convert the hadoop types to native JVM types for simplicity 51 | def convert(input: (LongWritable, Text)) = { 52 | (input._1.get(), input._2.toString()) 53 | } 54 | val input: DStream[(Long, String)] = inputDStream.map(convert) 55 | //end::file[] 56 | input 57 | } 58 | 59 | def repartition(dstream: DStream[_]) = { 60 | //tag::repartition[] 61 | dstream.repartition(20) 62 | //end::repartition[] 63 | } 64 | 65 | //tag::repartitionWithTransform[] 66 | def dStreamRepartition[A: ClassTag](dstream: DStream[A]): DStream[A] = { 67 | dstream.transform{rdd => rdd.repartition(20)} 68 | } 69 | //end::repartitionWithTransform[] 70 | 71 | def simpleTextOut(target: String, dstream: DStream[_]) = { 72 | //tag::simpleOut[] 73 | dstream.saveAsTextFiles(target) 74 | //end::simpleOut[] 75 | } 76 | 77 | def foreachSaveSequence(target: String, dstream: DStream[(Long, String)]) = { 78 | //tag::foreachSave[] 79 | dstream.foreachRDD{(rdd, window) => 80 | rdd.saveAsSequenceFile(target + window) 81 | } 82 | //end::foreachSave[] 83 | } 84 | } 85 | -------------------------------------------------------------------------------- /core/src/main/perl/lib/HighPerformanceSpark/Examples.pm: -------------------------------------------------------------------------------- 1 | package HighPerformanceSpark::Examples; 2 | 3 | use 5.006; 4 | use strict; 5 | use warnings; 6 | 7 | =head1 NAME 8 | 9 | HighPerformanceSpark::Examples - The great new HighPerformanceSpark::Examples! 10 | 11 | =head1 VERSION 12 | 13 | Version 0.01 14 | 15 | =cut 16 | 17 | our $VERSION = '0.01'; 18 | 19 | 20 | =head1 SYNOPSIS 21 | 22 | Quick summary of what the module does. 23 | 24 | Perhaps a little code snippet. 25 | 26 | use HighPerformanceSpark::Examples; 27 | 28 | my $foo = HighPerformanceSpark::Examples->new(); 29 | ... 30 | 31 | =head1 EXPORT 32 | 33 | A list of functions that can be exported. 
You can delete this section 34 | if you don't export anything, such as for a purely object-oriented module. 35 | 36 | =head1 SUBROUTINES/METHODS 37 | 38 | =head2 function1 39 | 40 | =cut 41 | 42 | sub function1 { 43 | } 44 | 45 | =head2 function2 46 | 47 | =cut 48 | 49 | sub function2 { 50 | } 51 | 52 | =head1 AUTHOR 53 | 54 | Holden Karau And Rachel Warren, C<< >> 55 | 56 | =head1 BUGS 57 | 58 | Please report any bugs or feature requests to C, or through 59 | the web interface at L. I will be notified, and then you'll 60 | automatically be notified of progress on your bug as I make changes. 61 | 62 | 63 | 64 | 65 | =head1 SUPPORT 66 | 67 | You can find documentation for this module with the perldoc command. 68 | 69 | perldoc HighPerformanceSpark::Examples 70 | 71 | 72 | You can also look for information at: 73 | 74 | =over 4 75 | 76 | =item * RT: CPAN's request tracker (report bugs here) 77 | 78 | L 79 | 80 | =item * AnnoCPAN: Annotated CPAN documentation 81 | 82 | L 83 | 84 | =item * CPAN Ratings 85 | 86 | L 87 | 88 | =item * Search CPAN 89 | 90 | L 91 | 92 | =back 93 | 94 | 95 | =head1 ACKNOWLEDGEMENTS 96 | 97 | 98 | =head1 LICENSE AND COPYRIGHT 99 | 100 | Copyright 2016 Holden Karau And Rachel Warren. 101 | 102 | Licensed under the Apache License, Version 2.0 (the "License"); 103 | you may not use this file except in compliance with the License. 104 | You may obtain a copy of the License at 105 | 106 | L 107 | 108 | Unless required by applicable law or agreed to in writing, software 109 | distributed under the License is distributed on an "AS IS" BASIS, 110 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 111 | See the License for the specific language governing permissions and 112 | limitations under the License. 113 | 114 | 115 | =cut 116 | 117 | 1; # End of HighPerformanceSpark::Examples 118 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/goldilocks/SortingTests.scala: -------------------------------------------------------------------------------- 1 | 2 | package com.highperformancespark.examples.goldilocks 3 | 4 | import scala.reflect.ClassTag 5 | 6 | import org.apache.spark.rdd.RDD 7 | 8 | import com.holdenkarau.spark.testing.SharedSparkContext 9 | import org.scalatest.funsuite.AnyFunSuite 10 | 11 | 12 | class SortingTests extends AnyFunSuite with SharedSparkContext { 13 | 14 | test("Test Sort by two keys"){ 15 | 16 | val sortedData: Array[((Int, Char), Double)] = Range(0, 15).flatMap( x => 17 | Range(50, 100).map(i => (( x, i.toChar), Math.random())) 18 | ).toArray 19 | 20 | val unsorted = scramble(sc.parallelize(sortedData),2) 21 | val sortedSimple: Array[((Int, Char), Double)] = unsorted.sortByKey().collect() 22 | 23 | assert(sortedSimple sameElements sortedData) 24 | } 25 | 26 | test("Panda Secondary Sort"){ 27 | val pandaData: Array[(String, StreetAddress, Int, Double)] = Array( 28 | ("Morris", StreetAddress("Accra","Grove", 52 ), 84440, 0.0), 29 | ("Joe", StreetAddress("Accra","Grove", 52 ), 94440, 0.0), 30 | ("Kobe", StreetAddress("Accra","Grove", 52 ), 94440, 0.0), 31 | 32 | ("Morris", StreetAddress("Albany","Grove", 52 ), 84440, 0.0), 33 | ("Joe", StreetAddress("Albany","Grove", 52 ), 94440, 0.0), 34 | ("Kobe", StreetAddress("Albany","Grove", 52 ), 94440, 0.5), 35 | ("Morris", StreetAddress("Denver","Grove", 52 ), 84440, 0.5), 36 | ("Joe", StreetAddress("LA","Grove", 52 ), 94440, 0.5), 37 | ("Kobe", StreetAddress("LA","Grove", 52 ), 94440, 0.5), 38 | 
("Joe", StreetAddress("SanFransisco","Grove", 52 ), 94440, 0.5), 39 | ("Kobe", StreetAddress("SanFransisco","Grove", 52 ), 94440, 0.5), 40 | ("Joe", StreetAddress("Seattle","Grove", 52 ), 84440, 0.5), 41 | ("Kobe", StreetAddress("Seattle","Grove", 52 ), 84440, 0.5), 42 | ("Lacy", StreetAddress("Seattle","Grove", 52 ), 84440, 0.5), 43 | ("Morris", StreetAddress("Seattle","Grove", 52 ), 84440, 0.5), 44 | ("Joe", StreetAddress("Seattle","Grove", 52 ), 94440, 0.5), 45 | ("Kobe", StreetAddress("Seattle","Grove", 52 ), 94440, 0.5), 46 | ("Lacy", StreetAddress("Seattle","Grove", 52 ), 94440, 0.5), 47 | ("Morris", StreetAddress("Seattle","Grove", 52 ), 94440, 0.5), 48 | ("Joe", StreetAddress("Tacoma","Grove", 52 ), 94440, 0.5), 49 | ("Kobe", StreetAddress("Tacoma","Grove", 52 ), 94440, 0.5), 50 | ("Lacy", StreetAddress("Tacoma","Grove", 52 ), 94440, 0.5), 51 | ("Morris", StreetAddress("Tacoma","Grove", 52 ), 94440, 0.5) 52 | ) 53 | 54 | val unsorted = scramble(sc.parallelize(pandaData)) 55 | val pandaSort = PandaSecondarySort.secondarySort(unsorted) 56 | pandaSort.zipWithIndex().collect.foreach{ 57 | case (x, i) => assert(x == pandaData(i.toInt), "Element " + x + " is wrong") 58 | } 59 | 60 | 61 | 62 | } 63 | 64 | 65 | def scramble[T : ClassTag]( rdd : RDD[T], partitions : Int= 3) = { 66 | val wRandom = rdd.map((Math.random(), _)) 67 | val unsorted = wRandom.sortByKey(true, partitions) 68 | unsorted.values 69 | } 70 | 71 | } 72 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/tools/GenerateScalingData.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.tools 2 | 3 | import org.apache.spark._ 4 | import org.apache.spark.mllib.linalg.Vector 5 | import org.apache.spark.mllib.random.RandomRDDs 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.sql.Row 8 | 9 | import com.highperformancespark.examples.dataframe.RawPanda 10 | 11 | object GenerateScalingData { 12 | /** 13 | * Generate a Goldilocks data set. We expect the zip code to follow an exponential 14 | * distribution and the data its self to be normal 15 | * 16 | * Note: May generate less than number of requested rows due to different 17 | * distribution between 18 | * 19 | * partitions and zip being computed per partition. 20 | * @param rows number of rows in the RDD (approximate) 21 | * @param size number of value elements 22 | */ 23 | def generateFullGoldilocks(sc: SparkContext, rows: Long, numCols: Int): 24 | RDD[RawPanda] = { 25 | val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000, size = rows) 26 | .map(_.toInt.toString) 27 | val valuesRDD = RandomRDDs.normalVectorRDD( 28 | sc, numRows = rows, numCols = numCols) 29 | .repartition(zipRDD.partitions.size) 30 | val keyRDD = sc.parallelize(1L.to(rows), zipRDD.getNumPartitions) 31 | keyRDD.zipPartitions(zipRDD, valuesRDD){ 32 | (i1, i2, i3) => 33 | new Iterator[(Long, String, Vector)] { 34 | def hasNext: Boolean = (i1.hasNext, i2.hasNext, i3.hasNext) match { 35 | case (true, true, true) => true 36 | case (false, false, false) => false 37 | // Note: this is "unsafe" (we throw away data when one of 38 | // the partitions has run out). 
39 | case _ => false 40 | } 41 | def next(): (Long, String, Vector) = (i1.next(), i2.next(), i3.next()) 42 | } 43 | }.map{case (k, z, v) => 44 | RawPanda(k, z, "giant", v(0) > 0.5, v.toArray)} 45 | } 46 | 47 | /** 48 | * Transform it down to just the data used for the benchmark 49 | */ 50 | def generateMiniScale(sc: SparkContext, rows: Long, numCols: Int): 51 | RDD[(Int, Double)] = { 52 | generateFullGoldilocks(sc, rows, numCols) 53 | .map(p => (p.zip.toInt, p.attributes(0))) 54 | } 55 | 56 | /** 57 | * Transform it down to just the data used for the benchmark 58 | */ 59 | def generateMiniScaleRows(sc: SparkContext, rows: Long, numCols: Int): 60 | RDD[Row] = { 61 | generateMiniScale(sc, rows, numCols).map{case (zip, fuzzy) => Row(zip, fuzzy)} 62 | } 63 | 64 | // tag::MAGIC_PANDA[] 65 | /** 66 | * Generate a Goldilocks data set all with the same id. 67 | * We expect the zip code to follow an exponential 68 | * distribution and the data its self to be normal. 69 | * Simplified to avoid a 3-way zip. 70 | * 71 | * Note: May generate less than number of requested rows due to 72 | * different distribution between partitions and zip being computed 73 | * per partition. 74 | */ 75 | def generateGoldilocks(sc: SparkContext, rows: Long, numCols: Int): 76 | RDD[RawPanda] = { 77 | val zipRDD = RandomRDDs.exponentialRDD(sc, mean = 1000, size = rows) 78 | .map(_.toInt.toString) 79 | val valuesRDD = RandomRDDs.normalVectorRDD( 80 | sc, numRows = rows, numCols = numCols) 81 | zipRDD.zip(valuesRDD).map{case (z, v) => 82 | RawPanda(1, z, "giant", v(0) > 0.5, v.toArray) 83 | } 84 | } 85 | // end::MAGIC_PANDA[] 86 | } 87 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/goldilocks/EvaluationTests.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.goldilocks 2 | 3 | import org.apache.spark.rdd.RDD 4 | 5 | import com.holdenkarau.spark.testing.SharedSparkContext 6 | import org.scalatest.funsuite.AnyFunSuite 7 | 8 | class EvaluationTests extends AnyFunSuite with SharedSparkContext { 9 | val doubleList = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0) 10 | val keyValuePairs = Array(1.0, 2.0, 3.0, 4.0, 1.0, 2.0, 3.0, 4.0).zipWithIndex 11 | val path = "target/testResults" 12 | 13 | test("MapValues preserves Partitioning "){ 14 | val data: RDD[(Double, Int )] = sc.parallelize(keyValuePairs) 15 | // tag::MapValues[] 16 | val sortedData = data.sortByKey() 17 | val mapValues: RDD[(Double, String)] = sortedData.mapValues(_.toString) 18 | assert(mapValues.partitioner.isDefined, 19 | "Using Map Values preserves partitioning") 20 | 21 | val map = sortedData.map( pair => (pair._1, pair._2.toString)) 22 | assert(map.partitioner.isEmpty, "Using map does not preserve partitioning") 23 | // end::MapValues[] 24 | } 25 | 26 | test( "Subtract Behavior "){ 27 | // tag::Subtract[] 28 | val a = Array(1, 2, 3, 4, 4, 4, 4) 29 | val b = Array(3, 4) 30 | val rddA = sc.parallelize(a) 31 | val rddB = sc.parallelize(b) 32 | val rddC = rddA.subtract(rddB) 33 | assert(rddC.count() < rddA.count() - rddB.count()) 34 | // end::Subtract[] 35 | } 36 | 37 | test( "Intersection Behavior "){ 38 | // tag::Intersect[] 39 | val a = Array(1, 2, 3, 4, 4, 4, 4) 40 | val b = Array(3, 4) 41 | val rddA = sc.parallelize(a) 42 | val rddB = sc.parallelize(b) 43 | val intersection = rddA.intersection(rddB) 44 | val subtraction = rddA.subtract(rddB) 45 | val union = intersection.union(subtraction) 
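    // Why the assertion below holds: intersection() de-duplicates and
    // subtract() removes every copy of the shared elements, so the recombined
    // RDD is {1, 2, 3, 4} while rddA still contains three extra 4s.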
46 | assert(!rddA.collect().sorted.sameElements(union.collect().sorted)) 47 | // end::Intersect[] 48 | } 49 | 50 | test("Itereative Computations "){ 51 | def rmse(rdd : RDD[(Int, Int )]) = { 52 | val n = rdd.count() 53 | math.sqrt(rdd.map(x => (x._1 - x._2) * (x._1 - x._2)).reduce(_ + _) / n) 54 | } 55 | 56 | val validationSet = sc.parallelize(keyValuePairs) 57 | 58 | // tag::iterativeComp[] 59 | val testSet: Array[RDD[(Double, Int)]] = 60 | Array( 61 | validationSet.mapValues(_ + 1), 62 | validationSet.mapValues(_ + 2), 63 | validationSet) 64 | validationSet.persist() //persist since we are using this RDD several times 65 | val errors = testSet.map( rdd => { 66 | rmse(rdd.join(validationSet).values) 67 | }) 68 | // end::iterativeComp[] 69 | 70 | // the one where we didn't change anything should have the 71 | // lowest root mean squared error 72 | assert(errors.min == errors(2)) 73 | 74 | } 75 | 76 | test( "Two actions without caching ") { 77 | val rddA: RDD[(Double, Int)] = sc.parallelize(keyValuePairs) 78 | 79 | // tag::TwoActions[] 80 | val sorted = rddA.sortByKey() 81 | val count = sorted.count() // sorted Action 1 82 | val sample: Long = count / 10 83 | val sampled = sorted.take(sample.toInt) // sorted Action 2 84 | // end::TwoActions[] 85 | } 86 | 87 | test( "Two actions with caching "){ 88 | val rddA: RDD[(Double, Int)] = sc.parallelize(keyValuePairs) 89 | // tag::TwoActionsCache[] 90 | val sorted = rddA.sortByKey() 91 | sorted.persist() 92 | val count = sorted.count() // sorted Action 1 93 | val sample: Long = count / 10 94 | val sampled = sorted.take(sample.toInt) // sorted Action 2 95 | // end::TwoActionsCache[] 96 | } 97 | 98 | 99 | 100 | } 101 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/perf/SimplePerfTest.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Licensed to the Apache Software Foundation (ASF) under one or more 3 | * contributor license agreements. See the NOTICE file distributed with 4 | * this work for additional information regarding copyright ownership. 5 | * The ASF licenses this file to You under the Apache License, Version 2.0 6 | * (the "License"); you may not use this file except in compliance with 7 | * the License. You may obtain a copy of the License at 8 | * 9 | * http://www.apache.org/licenses/LICENSE-2.0 10 | * 11 | * Unless required by applicable law or agreed to in writing, software 12 | * distributed under the License is distributed on an "AS IS" BASIS, 13 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | * See the License for the specific language governing permissions and 15 | * limitations under the License. 
16 | */ 17 | package com.highperformancespark.examples.perf 18 | 19 | import org.apache.spark.SparkConf 20 | import org.apache.spark.SparkContext 21 | import org.apache.spark.rdd._ 22 | import org.apache.spark.sql.DataFrame 23 | import org.apache.spark.sql.Dataset 24 | import org.apache.spark.sql.Row 25 | import org.apache.spark.sql.SparkSession 26 | import org.apache.spark.sql.types._ 27 | 28 | import com.highperformancespark.examples.dataframe.RawPanda 29 | import com.highperformancespark.examples.tools._ 30 | 31 | /** 32 | * A simple performance test to compare a simple sort between DataFrame, and RDD 33 | */ 34 | object SimplePerfTest { 35 | // $COVERAGE-OFF$ 36 | def main(args: Array[String]) = { 37 | val sparkConf = new SparkConf().setAppName("simple-perf-test") 38 | val sparkSession = SparkSession.builder().enableHiveSupport().getOrCreate() 39 | val sc = sparkSession.sparkContext 40 | val scalingFactor = if (args.length > 0) args(0).toLong else 100L 41 | val size = if (args.length > 1) args(1).toInt else 50 42 | run(sc, sparkSession, scalingFactor, size) 43 | } 44 | 45 | def run(sc: SparkContext, session: SparkSession, 46 | scalingFactor: Long, size: Int) = { 47 | import session.implicits._ 48 | val inputRDD = GenerateScalingData.generateFullGoldilocks( 49 | sc, scalingFactor, size) 50 | val pairRDD = inputRDD.map(p => (p.zip.toInt, p.attributes(0))) 51 | pairRDD.cache() 52 | pairRDD.count() 53 | val rddTimeings = 1.to(10).map(x => time(testOnRDD(pairRDD))) 54 | val groupTimeings = 1.to(10).map(x => time(groupOnRDD(pairRDD))) 55 | val df = inputRDD.toDF() 56 | val inputDataFrame = df.select( 57 | df("zip").cast(IntegerType), 58 | df("attributes")(0).as("fuzzyness").cast(DoubleType)) 59 | inputDataFrame.cache() 60 | inputDataFrame.count() 61 | val dataFrameTimeings = 1.to(10).map(x => time(testOnDataFrame(inputDataFrame))) 62 | println(rddTimeings.map(_._2).mkString(",")) 63 | println(groupTimeings.map(_._2).mkString(",")) 64 | println(dataFrameTimeings.map(_._2).mkString(",")) 65 | } 66 | 67 | def testOnRDD(rdd: RDD[(Int, Double)]): Long = { 68 | val kvc: RDD[(Int, (Double , Int))] = rdd.map{case (x, y) => (x, (y, 1))} 69 | kvc.reduceByKey((x, y) => (x._1 + y._1, x._2 + y._2)).count() 70 | } 71 | 72 | def groupOnRDD(rdd: RDD[(Int, Double)]) = { 73 | rdd.groupByKey().mapValues{v => 74 | v.aggregate((0.0, 0))({case (x, y) => (x._1 + y, x._2 + 1)}, 75 | {case (x, y) => (x._1 + y._1, x._2 + y._2)})}.count() 76 | } 77 | 78 | def testOnDataFrame(df: DataFrame) = { 79 | df.groupBy("zip").avg("fuzzyness").count() 80 | } 81 | 82 | def time[R](block: => R): (R, Long) = { 83 | val t0 = System.nanoTime() 84 | val result = block // call-by-name 85 | val t1 = System.nanoTime() 86 | println(s"Time ${t1 - t0}ns") 87 | (result, t1 - t0) 88 | } 89 | // $COVERAGE-ON$ 90 | } 91 | -------------------------------------------------------------------------------- /Dockerfile-mini: -------------------------------------------------------------------------------- 1 | # Open JDK11, Spark 3.X and the latest JDKs get a little spicy 2 | FROM azul/zulu-openjdk:11-latest 3 | 4 | RUN apt-get -qq update && \ 5 | apt-get -qq -y upgrade && \ 6 | apt-get -qq -y install gnupg software-properties-common locales curl tzdata apt-transport-https curl gnupg sudo net-tools psmisc htop python-is-python3 && \ 7 | locale-gen en_US.UTF-8 && \ 8 | apt-get -qq -y install gnupg software-properties-common curl git-core wget axel python3 python3-pip nano emacs vim && \ 9 | echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" 
| tee /etc/apt/sources.list.d/sbt.list && \ 10 | echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \ 11 | curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \ 12 | chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \ 13 | apt-get update && \ 14 | apt-get -qq -y install sbt && \ 15 | rm -rf /var/lib/apt/lists/* 16 | 17 | RUN curl -Lo coursier https://git.io/coursier-cli 18 | RUN chmod +x coursier 19 | # ensure the JAR of the CLI is in the coursier cache, in the image 20 | RUN ./coursier --help 21 | RUN pip install --no-cache-dir jupyter 22 | # Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3 23 | #RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8 24 | RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && dpkg --install scala-2.13.8.deb && rm scala-2.13.8.deb 25 | 26 | RUN ./coursier bootstrap \ 27 | -r jitpack \ 28 | -i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \ 29 | sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \ 30 | --default=true --sources \ 31 | -o almond && \ 32 | ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" 33 | 34 | 35 | RUN adduser dev 36 | RUN adduser dev sudo 37 | RUN echo 'dev:dev' | chpasswd 38 | RUN mkdir -p ~dev 39 | RUN cp ./coursier ~dev/ 40 | RUN echo "color_prompt=yes" >> ~dev/.bashrc 41 | RUN echo "export force_color_prompt=yes" >> ~dev/.bashrc 42 | RUN echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.2-bin-hadoop3" >> ~dev/.bashrc 43 | RUN chown -R dev ~dev 44 | USER dev 45 | # Kernels are installed in user so we need to run as the user 46 | RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13" 47 | USER root 48 | 49 | RUN mkdir -p /high-performance-spark-examples 50 | RUN mkdir -p /high-performance-spark-examples/warehouse 51 | RUN chown -R dev /high-performance-spark-examples 52 | WORKDIR /high-performance-spark-examples 53 | # Increase the chance of caching by copying just the env setup file first. 
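# (Docker reuses the cached layer for the RUN below as long as env_setup.sh is
# unchanged, so later edits to the application code do not force the slow
# Spark/Iceberg download to be repeated.)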
54 | COPY --chown=dev:dev env_setup.sh ./ 55 | # Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place 56 | # Also downloads some test data 57 | RUN SCALA_VERSION=2.13 ./env_setup.sh && rm *.tgz 58 | RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back 59 | # Note: We need to use /home in the COPY otherwise no happy pandas 60 | COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new 61 | RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json 62 | RUN chown -R dev /high-performance-spark-examples 63 | ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/ 64 | RUN git clone https://github.com/holdenk/spark-upgrade.git 65 | RUN chown -R dev /high-performance-spark-examples 66 | USER dev 67 | RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history 68 | CMD ["/high-performance-spark-examples/misc/container_launch.sh"] 69 | 70 | -------------------------------------------------------------------------------- /python/examples/spark_expectations_example.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkFiles 2 | from pyspark.sql import * 3 | from spark_expectations.core.expectations import ( 4 | SparkExpectations, 5 | WrappedDataFrameWriter, 6 | ) 7 | 8 | spark = SparkSession.builder.master("local[4]").getOrCreate() 9 | sc = spark.sparkContext 10 | sc.setLogLevel("ERROR") 11 | 12 | # tag::global_setup[] 13 | se_conf = { 14 | "se_notifications_enable_email": False, 15 | "se_notifications_email_smtp_host": "mailhost.example.com", 16 | "se_notifications_email_smtp_port": 25, 17 | "se_notifications_email_from": "timbit@example.com", 18 | "se_notifications_email_subject": "spark expectations - data quality - notifications", 19 | "se_notifications_on_fail": True, 20 | "se_notifications_on_error_drop_exceeds_threshold_breach": True, 21 | "se_notifications_on_error_drop_threshold": 15, 22 | } 23 | # end::global_setup[] 24 | 25 | 26 | # tag::setup_and_load[] 27 | from spark_expectations.config.user_config import Constants as user_config 28 | 29 | spark.sql("DROP TABLE IF EXISTS local.magic_validation") 30 | spark.sql( 31 | """ 32 | create table local.magic_validation ( 33 | product_id STRING, 34 | table_name STRING, 35 | rule_type STRING, 36 | rule STRING, 37 | column_name STRING, 38 | expectation STRING, 39 | action_if_failed STRING, 40 | tag STRING, 41 | description STRING, 42 | enable_for_source_dq_validation BOOLEAN, 43 | enable_for_target_dq_validation BOOLEAN, 44 | is_active BOOLEAN, 45 | enable_error_drop_alert BOOLEAN, 46 | error_drop_threshold INT 47 | )""" 48 | ) 49 | # Reminder: addFile does not handle directories well. 50 | rule_file = "spark_expectations_sample_rules.json" 51 | sc.addFile(rule_file) 52 | df = spark.read.json(SparkFiles.get(rule_file)) 53 | print(df) 54 | df.write.option("byname", "true").mode("append").saveAsTable("local.magic_validation") 55 | spark.read.table("local.magic_validation").show() 56 | 57 | # Can be used to point to your desired metastore. 58 | se_writer = WrappedDataFrameWriter().mode("append").format("iceberg") 59 | 60 | rule_df = spark.sql("select * from local.magic_validation") 61 | 62 | se: SparkExpectations = SparkExpectations( 63 | rules_df=rule_df, # See if we can replace this with the DF we wrote out. 
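    # The remaining arguments wire SE to the Iceberg writer configured above:
    # stats rows and error rows are written through se_writer, and streaming
    # of the stats is disabled via the user_config constant.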
64 | product_id="pay", # We will only apply rules matching this product id 65 | stats_table="local.dq_stats", 66 | stats_table_writer=se_writer, 67 | target_and_error_table_writer=se_writer, 68 | stats_streaming_options={user_config.se_enable_streaming: False}, 69 | ) 70 | # end::setup_and_load[] 71 | rule_df.show(truncate=200) 72 | 73 | 74 | # tag::run_validation_row[] 75 | @se.with_expectations( 76 | user_conf=se_conf, 77 | write_to_table=False, # If set to true SE will write to the target table. 78 | target_and_error_table_writer=se_writer, 79 | # target_table is used to create the error table (e.g. here local.fake_table_name_error) 80 | # and filter the rules on top of the global product filter. 81 | target_table="local.fake_table_name", 82 | ) 83 | def load_data(): 84 | raw_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) 85 | uk_df = raw_df.select("CompanyNumber", "MaleBonusPercent", "FemaleBonuspercent") 86 | return uk_df 87 | 88 | 89 | # data = load_data() 90 | # end::run_validation_row[] 91 | 92 | 93 | # tag::run_validation_complex[] 94 | @se.with_expectations( 95 | user_conf=se_conf, 96 | write_to_table=True, # If set to true SE will write to the target table. 97 | target_and_error_table_writer=se_writer, 98 | # target_table is used to create the error table (e.g. here local.fake_table_name_error) 99 | # and filter the rules on top of the global product filter. 100 | target_table="local.3rd_fake", 101 | ) 102 | def load_data2(): 103 | raw_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True) 104 | uk_df = raw_df.select("CompanyNumber", "MaleBonusPercent", "FemaleBonuspercent") 105 | return uk_df 106 | 107 | 108 | data = load_data2() 109 | # end::run_validation_complex[] 110 | 111 | spark.sql("SELECT * FROM local.3rd_fake_error").show(truncate=300) 112 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/transformations/Accumulators.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Illustrates how to use Spark accumulators. Note that most of these examples 3 | * are "dangerous" in that they may not return consistent results. 4 | */ 5 | package com.highperformancespark.examples.transformations 6 | 7 | import java.{lang => jl} 8 | 9 | import scala.collection.mutable.HashSet 10 | 11 | import org.apache.spark._ 12 | import org.apache.spark.rdd._ 13 | import org.apache.spark.util.AccumulatorV2 14 | 15 | import com.highperformancespark.examples.dataframe.RawPanda 16 | object Accumulators { 17 | /** 18 | * Compute the total fuzzyness with an accumulator while generating 19 | * an id and zip pair for sorting. 20 | */ 21 | //tag::sumFuzzyAcc[] 22 | def computeTotalFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): 23 | (RDD[(String, Long)], Double) = { 24 | // Create an accumulator with the initial value of 0.0 25 | val acc = sc.doubleAccumulator 26 | val transformed = rdd.map{x => acc.add(x.attributes(0)); (x.zip, x.id)} 27 | // accumulator still has zero value 28 | // Note: This example is dangerous since the transformation may be 29 | // evaluated multiple times. 30 | transformed.count() // force evaluation 31 | (transformed, acc.value) 32 | } 33 | //end::sumFuzzyAcc[] 34 | 35 | /** 36 | * Compute the max fuzzyness with an accumulator while generating an 37 | * id and zip pair for sorting. 
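   *
   * (SparkContext's built-in numeric accumulators only track sums and counts,
   * which is why a custom AccumulatorV2 is defined below to track the maximum.)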
38 | */ 39 | //tag::maxFuzzyAcc[] 40 | def computeMaxFuzzyNess(sc: SparkContext, rdd: RDD[RawPanda]): 41 | (RDD[(String, Long)], Double) = { 42 | class MaxDoubleParam extends AccumulatorV2[jl.Double, jl.Double] { 43 | var _value = Double.MinValue 44 | override def isZero(): Boolean = { 45 | _value == Double.MinValue 46 | } 47 | override def reset() = { 48 | _value = Double.MinValue 49 | } 50 | 51 | override def add(r1: jl.Double): Unit = { 52 | _value = Math.max(r1, _value) 53 | } 54 | 55 | def add(r1: Double): Unit = { 56 | _value = Math.max(r1, _value) 57 | } 58 | 59 | def copy(): MaxDoubleParam = { 60 | val newAcc = new MaxDoubleParam() 61 | newAcc._value = _value 62 | newAcc 63 | } 64 | 65 | override def merge(other: AccumulatorV2[jl.Double, jl.Double]): Unit = other match { 66 | case o: MaxDoubleParam => 67 | _value = Math.max(_value, o._value) 68 | case _ => 69 | throw new UnsupportedOperationException( 70 | s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") 71 | } 72 | 73 | override def value: jl.Double = _value 74 | } 75 | // Create an accumulator with the initial value of Double.MinValue 76 | val acc = new MaxDoubleParam() 77 | sc.register(acc) 78 | val transformed = rdd.map{x => acc.add(x.attributes(0)); (x.zip, x.id)} 79 | // accumulator still has Double.MinValue 80 | // Note: This example is dangerous since the transformation may be 81 | // evaluated multiple times. 82 | transformed.count() // force evaluation 83 | (transformed, acc.value) 84 | } 85 | //end::maxFuzzyAcc[] 86 | 87 | //tag::uniquePandaAcc[] 88 | def uniquePandas(sc: SparkContext, rdd: RDD[RawPanda]): HashSet[Long] = { 89 | class UniqParam extends AccumulatorV2[Long, HashSet[Long]] { 90 | val _values = new HashSet[Long] 91 | override def isZero() = _values.isEmpty 92 | 93 | override def copy(): UniqParam = { 94 | val nacc = new UniqParam 95 | nacc._values ++= _values 96 | nacc 97 | } 98 | 99 | override def reset(): Unit = { 100 | _values.clear() 101 | } 102 | 103 | override def merge(other: AccumulatorV2[Long, HashSet[Long]]): Unit = other match { 104 | case o: UniqParam => 105 | _values ++= o._values 106 | case _ => 107 | throw new UnsupportedOperationException( 108 | s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}") 109 | } 110 | 111 | override def value: HashSet[Long] = _values 112 | // For adding new values 113 | override def add(t: Long) = { 114 | _values += t 115 | } 116 | } 117 | // Create an accumulator with the initial value of Double.MinValue 118 | val acc = new UniqParam() 119 | sc.register(acc) 120 | val transformed = rdd.map{x => acc.add(x.id); (x.zip, x.id)} 121 | // accumulator still has zero values 122 | transformed.count() // force evaluation 123 | acc.value 124 | } 125 | //end::uniquePandaAcc[] 126 | } 127 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/MixedDataset.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * A sample mixing relational & functional transformations with Datasets. 
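 *
 * A minimal usage sketch (assuming a SparkSession named `spark` and a
 * Dataset[RawPanda] named `pandas`; both names are illustrative):
 * {{{
 * val mixed = new MixedDataset(spark.sqlContext)
 * val totalFuzzyness = mixed.happyPandaSums(pandas)
 * }}}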
3 | */ 4 | package com.highperformancespark.examples.dataframe 5 | 6 | import org.apache.spark._ 7 | import org.apache.spark.rdd.RDD 8 | import org.apache.spark.sql._ 9 | import org.apache.spark.sql.catalyst.expressions.aggregate._ 10 | import org.apache.spark.sql.expressions._ 11 | import org.apache.spark.sql.functions._ 12 | import org.apache.spark.sql.hive._ 13 | import org.apache.spark.sql.hive.thriftserver._ 14 | import org.apache.spark.sql.types._ 15 | 16 | case class MiniPandaInfo(zip: String, size: Double) 17 | 18 | class MixedDataset(sqlCtx: SQLContext) { 19 | import sqlCtx.implicits._ 20 | 21 | /** 22 | * A sample function on a Dataset of RawPandas. 23 | * 24 | * This is contrived, since our reduction could also be done with SQL aggregates, 25 | * but we can see the flexibility of being able to specify arbitrary Scala code. 26 | */ 27 | def happyPandaSums(ds: Dataset[RawPanda]): Double = { 28 | ds.toDF().filter($"happy" === true).as[RawPanda]. 29 | select($"attributes"(0).as[Double]). 30 | reduce((x, y) => x + y) 31 | } 32 | 33 | /** 34 | * A sample function on a Dataset of RawPandas. 35 | * Use the first attribute to deterimine if a panda is squishy. 36 | */ 37 | //tag::basicSelect[] 38 | def squishyPandas(ds: Dataset[RawPanda]): Dataset[(Long, Boolean)] = { 39 | ds.select($"id".as[Long], ($"attributes"(0) > 0.5).as[Boolean]) 40 | } 41 | //end::basicSelect[] 42 | 43 | /** 44 | * Union happy and sad pandas 45 | */ 46 | //tag::basicUnion[] 47 | def unionPandas(happyPandas: Dataset[RawPanda], sadPandas: Dataset[RawPanda]) = { 48 | happyPandas.union(sadPandas) 49 | } 50 | //end::basicUnion[] 51 | 52 | /** 53 | * Functional map + Dataset, sums the positive attributes for the pandas 54 | */ 55 | //tag::functionalQuery[] 56 | def funMap(ds: Dataset[RawPanda]): Dataset[Double] = { 57 | ds.map{rp => rp.attributes.filter(_ > 0).sum} 58 | } 59 | //end::functionalQuery[] 60 | 61 | //tag::maxPandaSizePerZip[] 62 | def maxPandaSizePerZip(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { 63 | ds.map(rp => MiniPandaInfo(rp.zip, rp.attributes(2))) 64 | .groupByKey(mp => mp.zip).agg(max("size").as[Double]) 65 | } 66 | //end::maxPandaSizePerZip[] 67 | 68 | //tag::maxPandaSizePerZipScala[] 69 | def maxPandaSizePerZipScala(ds: Dataset[RawPanda]): Dataset[(String, Double)] = { 70 | def groupMapFun(g: String, iter: Iterator[RawPanda]): (String, Double) = { 71 | (g, iter.map(_.attributes(2)).reduceLeft(Math.max(_, _))) 72 | } 73 | ds.groupByKey(rp => rp.zip).mapGroups(groupMapFun) 74 | } 75 | //end::maxPandaSizePerZipScala[] 76 | 77 | /** 78 | * Illustrate how we make typed queries, using some of the float properties 79 | * to produce boolean values. 
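   * (The body below simply selects the first attribute as a typed Double column;
   * see squishyPandas above for the variant that produces the boolean values.)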
80 | */ 81 | def typedQueryExample(ds: Dataset[RawPanda]): Dataset[Double] = { 82 | ds.select($"attributes"(0).as[Double]) 83 | } 84 | 85 | /** 86 | * Illustrate Dataset joins 87 | */ 88 | def joinSample(pandas: Dataset[RawPanda], coffeeShops: Dataset[CoffeeShop]): 89 | Dataset[(RawPanda, CoffeeShop)] = { 90 | //tag::joinWith[] 91 | val result: Dataset[(RawPanda, CoffeeShop)] = pandas.joinWith(coffeeShops, 92 | pandas("zip") === coffeeShops("zip")) 93 | //end::joinWith[] 94 | result 95 | } 96 | 97 | /** 98 | * Illustrate a self join to compare pandas in the same zip code 99 | */ 100 | def selfJoin(pandas: Dataset[RawPanda]): 101 | Dataset[(RawPanda, RawPanda)] = { 102 | //tag::selfJoin[] 103 | val result: Dataset[(RawPanda, RawPanda)] = pandas.as("l").joinWith(pandas.as("r"), 104 | $"l.zip" === $"r.zip") 105 | //end::selfJoin[] 106 | result 107 | } 108 | 109 | //tag::fromRDD[] 110 | /** 111 | * Illustrate converting an RDD to DS 112 | */ 113 | def fromRDD(rdd: RDD[RawPanda]): Dataset[RawPanda] = { 114 | rdd.toDS 115 | } 116 | 117 | //end::fromRDD[] 118 | 119 | //tag::toRDDDF[] 120 | /** 121 | * Illustrate converting a Dataset to an RDD 122 | */ 123 | def toRDD(ds: Dataset[RawPanda]): RDD[RawPanda] = { 124 | ds.rdd 125 | } 126 | 127 | /** 128 | * Illustrate converting a Dataset to a DataFrame 129 | */ 130 | def toDF(ds: Dataset[RawPanda]): DataFrame = { 131 | ds.toDF() 132 | } 133 | //end::toRDDDF[] 134 | 135 | /** 136 | * Illustrate DataFrame to Dataset. Its important to note that if the schema 137 | * does not match what is expected by the Dataset this fails fast. 138 | */ 139 | //tag::DataFrameAsDataset[] 140 | def fromDF(df: DataFrame): Dataset[RawPanda] = { 141 | df.as[RawPanda] 142 | } 143 | //end::DataFrameAsDataset[] 144 | } 145 | -------------------------------------------------------------------------------- /python/examples/bad_pyspark.py: -------------------------------------------------------------------------------- 1 | # This script triggers a number of different PySpark errors 2 | 3 | from pyspark.sql.session import SparkSession 4 | import sys 5 | 6 | global sc 7 | 8 | 9 | def nonExistentInput(sc): 10 | """ 11 | Attempt to load non existent input 12 | >>> nonExistentInput(sc) 13 | Traceback (most recent call last): 14 | ... 15 | Py4JJavaError:... 16 | """ 17 | # tag::nonExistent[] 18 | failedRdd = sc.textFile("file:///doesnotexist") 19 | failedRdd.count() 20 | # end::nonExistent[] 21 | 22 | 23 | def throwOuter(sc): 24 | """ 25 | Attempt to load non existant input 26 | >>> throwOuter(sc) 27 | Traceback (most recent call last): 28 | ... 29 | Py4JJavaError:... 30 | """ 31 | # tag::throwOuter[] 32 | data = sc.parallelize(range(10)) 33 | transform1 = data.map(lambda x: x + 1) 34 | transform2 = transform1.map(lambda x: x / 0) 35 | transform2.count() 36 | # end::throwOuter[] 37 | 38 | 39 | def throwInner(sc): 40 | """ 41 | Attempt to load non existant input 42 | >>> throwInner(sc) 43 | Traceback (most recent call last): 44 | ... 45 | Py4JJavaError:... 46 | """ 47 | # tag::throwInner[] 48 | data = sc.parallelize(range(10)) 49 | transform1 = data.map(lambda x: x / 0) 50 | transform2 = transform1.map(lambda x: x + 1) 51 | transform2.count() 52 | # end::throwInner[] 53 | 54 | 55 | # tag::rewrite[] 56 | def add1(x): 57 | """ 58 | Add 1 59 | >>> add1(2) 60 | 3 61 | """ 62 | return x + 1 63 | 64 | 65 | def divZero(x): 66 | """ 67 | Divide by zero (cause an error) 68 | >>> divZero(2) 69 | Traceback (most recent call last): 70 | ... 
71 | ZeroDivisionError: integer division or modulo by zero 72 | """ 73 | return x / 0 74 | 75 | 76 | def throwOuter2(sc): 77 | """ 78 | Attempt to load non existant input 79 | >>> throwOuter2(sc) 80 | Traceback (most recent call last): 81 | ... 82 | Py4JJavaError:... 83 | """ 84 | data = sc.parallelize(range(10)) 85 | transform1 = data.map(add1) 86 | transform2 = transform1.map(divZero) 87 | transform2.count() 88 | 89 | 90 | def throwInner2(sc): 91 | """ 92 | Attempt to load non existant input 93 | >>> throwInner2(sc) 94 | Traceback (most recent call last): 95 | ... 96 | Py4JJavaError:... 97 | """ 98 | data = sc.parallelize(range(10)) 99 | transform1 = data.map(divZero) 100 | transform2 = transform1.map(add1) 101 | transform2.count() 102 | 103 | 104 | # end::rewrite[] 105 | 106 | 107 | def throwInner3(sc): 108 | """ 109 | Attempt to load non existant input 110 | >>> throwInner3(sc) 111 | Reject 10 112 | """ 113 | data = sc.parallelize(range(10)) 114 | rejectedCount = sc.accumulator(0) 115 | 116 | def loggedDivZero(x): 117 | import logging 118 | 119 | try: 120 | return [x / 0] 121 | except Exception as e: 122 | rejectedCount.add(1) 123 | logging.warning("Error found " + repr(e)) 124 | return [] 125 | 126 | transform1 = data.flatMap(loggedDivZero) 127 | transform2 = transform1.map(add1) 128 | transform2.count() 129 | print("Reject " + str(rejectedCount.value)) 130 | 131 | 132 | def runOutOfMemory(sc): 133 | """ 134 | Run out of memory on the workers from a skewed shuffle. 135 | >>> runOutOfMemory(sc) # doctest: +SKIP 136 | Traceback (most recent call last): 137 | ... 138 | Py4JJavaError:... 139 | """ 140 | # tag::worker_oom[] 141 | data = sc.parallelize(range(10000)) 142 | 143 | def generate_too_much(i: int): 144 | return list(map(lambda v: (i % 2, v), range(100000 * i))) 145 | 146 | bad = data.flatMap(generate_too_much).groupByKey() 147 | bad.count() 148 | # end::worker_oom[] 149 | 150 | 151 | def _setupTest(): 152 | globs = globals() 153 | spark = SparkSession.builder.master("local[4]").getOrCreate() 154 | sc = spark._sc 155 | globs["sc"] = sc 156 | return globs 157 | 158 | 159 | def _test(): 160 | """ 161 | Run the tests. 162 | Note this will print a lot of error message to stderr since we don't 163 | capture the JVM sub process stdout/stderr for doctests. 
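
    The doctests can be run directly with python bad_pyspark.py, assuming
    pyspark is importable in the current environment.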
164 | """ 165 | import doctest 166 | 167 | globs = _setupTest() 168 | (failure_count, test_count) = doctest.testmod( 169 | globs=globs, optionflags=doctest.ELLIPSIS 170 | ) 171 | print("All tests done, stopping Spark context.") 172 | globs["sc"].stop() 173 | if failure_count: 174 | exit(-1) 175 | else: 176 | exit(0) 177 | 178 | 179 | if __name__ == "__main__": 180 | _test() 181 | # Hack to support running in nose 182 | elif sys.stdout != sys.__stdout__: 183 | _setupTest() 184 | -------------------------------------------------------------------------------- /core/src/main/java/com/highperformancespark/examples/dataframe/JavaLoadSave.java: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.dataframe; 2 | 3 | import com.highperformancespark.examples.objects.JavaPandaPlace; 4 | import com.highperformancespark.examples.objects.JavaRawPanda; 5 | import org.apache.spark.api.java.JavaRDD; 6 | import org.apache.spark.sql.*; 7 | import org.apache.spark.sql.types.*; 8 | 9 | import java.util.List; 10 | import java.util.Properties; 11 | import java.util.stream.Collectors; 12 | 13 | public class JavaLoadSave { 14 | private SQLContext sqlContext; 15 | 16 | public JavaLoadSave(SQLContext sqlContext) { 17 | this.sqlContext = sqlContext; 18 | } 19 | 20 | //tag::createFromRDD[] 21 | public Dataset createFromJavaBean(JavaRDD input) { 22 | // Create DataFrame using Java Bean 23 | Dataset df1 = sqlContext.createDataFrame(input, JavaPandaPlace.class); 24 | 25 | // Create DataFrame using JavaRDD 26 | JavaRDD rowRDD = input.map(pm -> RowFactory.create(pm.getName(), 27 | pm.getPandas().stream() 28 | .map(pi -> RowFactory.create(pi.getId(), pi.getZip(), pi.isHappy(), pi.getAttributes())) 29 | .collect(Collectors.toList()))); 30 | 31 | ArrayType pandasType = DataTypes.createArrayType(new StructType( 32 | new StructField[]{ 33 | new StructField("id", DataTypes.LongType, true, Metadata.empty()), 34 | new StructField("zip", DataTypes.StringType, true, Metadata.empty()), 35 | new StructField("happy", DataTypes.BooleanType, true, Metadata.empty()), 36 | new StructField("attributes", DataTypes.createArrayType(DataTypes.FloatType), true, Metadata.empty()) 37 | } 38 | )); 39 | 40 | StructType schema = new StructType(new StructField[]{ 41 | new StructField("name", DataTypes.StringType, true, Metadata.empty()), 42 | new StructField("pandas", pandasType, true, Metadata.empty()) 43 | }); 44 | 45 | Dataset df2 = sqlContext.createDataFrame(rowRDD, schema); 46 | return df2; 47 | } 48 | //end::createFromRDD[] 49 | 50 | //tag::createFromLocal[] 51 | public Dataset createFromLocal(List input) { 52 | return sqlContext.createDataFrame(input, PandaPlace.class); 53 | } 54 | //end::createFromLocal[] 55 | 56 | //tag::collectResults[] 57 | public List collectDF(Dataset df) { 58 | return df.collectAsList(); 59 | } 60 | //end::collectResults[] 61 | 62 | //tag::toRDD[] 63 | public JavaRDD toRDD(Dataset input) { 64 | JavaRDD rdd = input.javaRDD().map(row -> new JavaRawPanda(row.getLong(0), row.getString(1), 65 | row.getString(2), row.getBoolean(3), row.getList(4))); 66 | return rdd; 67 | } 68 | //end::toRDD[] 69 | 70 | //tag::partitionedOutput[] 71 | public void writeOutByZip(Dataset input) { 72 | input.write().partitionBy("zipcode").format("json").save("output/"); 73 | } 74 | //end::partitionedOutput[] 75 | 76 | //tag::saveAppend[] 77 | public void writeAppend(Dataset input) { 78 | input.write().mode(SaveMode.Append).save("output/"); 79 | } 80 | //end::saveAppend[] 81 
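  // Note: "jdbc:dialect:serverName" in the examples below is a placeholder URL;
  // a MySQL URL, for instance, would look more like "jdbc:mysql://host:3306/db".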
| 82 | public Dataset createJDBC() { 83 | //tag::createJDBC[] 84 | Dataset df1 = sqlContext.read().jdbc("jdbc:dialect:serverName;user=user;password=pass", 85 | "table", new Properties()); 86 | 87 | Dataset df2 = sqlContext.read().format("jdbc") 88 | .option("url", "jdbc:dialect:serverName") 89 | .option("dbtable", "table").load(); 90 | 91 | return df2; 92 | //end::createJDBC[] 93 | } 94 | 95 | public void writeJDBC(Dataset df) { 96 | //tag::writeJDBC[] 97 | df.write().jdbc("jdbc:dialect:serverName;user=user;password=pass", 98 | "table", new Properties()); 99 | 100 | df.write().format("jdbc") 101 | .option("url", "jdbc:dialect:serverName") 102 | .option("user", "user") 103 | .option("password", "pass") 104 | .option("dbtable", "table").save(); 105 | //end::writeJDBC[] 106 | } 107 | 108 | //tag::loadParquet[] 109 | public Dataset loadParquet(String path) { 110 | // Configure Spark to read binary data as string, note: must be configured on SQLContext 111 | sqlContext.setConf("spark.sql.parquet.binaryAsString", "true"); 112 | 113 | // Load parquet data using merge schema (configured through option) 114 | Dataset df = sqlContext.read() 115 | .option("mergeSchema", "true") 116 | .format("parquet") 117 | .load(path); 118 | 119 | return df; 120 | } 121 | //end::loadParquet[] 122 | 123 | //tag::writeParquet[] 124 | public void writeParquet(Dataset df, String path) { 125 | df.write().format("parquet").save(path); 126 | } 127 | //end::writeParquet[] 128 | 129 | //tag::loadHiveTable[] 130 | public Dataset loadHiveTable() { 131 | return sqlContext.read().table("pandas"); 132 | } 133 | //end::loadHiveTable[] 134 | 135 | //tag::saveManagedTable[] 136 | public void saveManagedTable(Dataset df) { 137 | df.write().saveAsTable("pandas"); 138 | } 139 | //end::saveManagedTable[] 140 | } 141 | -------------------------------------------------------------------------------- /python/examples/simple_perf.py: -------------------------------------------------------------------------------- 1 | # When running this example make sure to include the built Scala jar : 2 | # 3 | # $SPARK_HOME/bin/pyspark --jars \ 4 | # ./target/examples-0.0.1.jar --driver-class-path ./target/examples-0.0.1.jar 5 | # 6 | # This example illustrates how to interface Scala and Python code, but caution 7 | # should be taken as it depends on many private members that may change in 8 | # future releases of Spark. 9 | 10 | from pyspark.sql.types import StructType, IntegerType, DoubleType, StructField 11 | from pyspark.sql import DataFrame, SparkSession 12 | import sys 13 | import timeit 14 | import time 15 | 16 | 17 | def generate_scale_data(sqlCtx, rows, numCols): 18 | """ 19 | Generate scale data for the performance test. 20 | 21 | This also illustrates calling custom Scala code from the driver. 22 | 23 | .. Note: This depends on many internal methods and may break between versions. 
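
    (The ``100L`` literal in the doctest below is Python 2 long syntax; under
    Python 3 the call would need a plain ``100``.)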
24 | 25 | # This assumes our jars have been added with export PYSPARK_SUBMIT_ARGS 26 | >>> session = SparkSession.builder.getOrCreate() 27 | >>> scaleData = generate_scale_data(session, 100L, 1) 28 | >>> scaleData[0].count() 29 | 100 30 | >>> scaleData[1].count() 31 | 100 32 | >>> session.stop() 33 | """ 34 | # tag::javaInterop[] 35 | sc = sqlCtx._sc 36 | javaSparkSession = sqlCtx._jsparkSession 37 | jsc = sc._jsc 38 | scalasc = jsc.sc() 39 | gateway = sc._gateway 40 | # Call a java method that gives us back an RDD of JVM Rows (Int, Double) 41 | # While Python RDDs are wrapped Java RDDs (even of Rows) the contents are 42 | # different, so we can't directly wrap this. 43 | # This returns a Java RDD of Rows - normally it would better to 44 | # return a DataFrame directly, but for illustration we will work 45 | # with an RDD of Rows. 46 | java_rdd = gateway.jvm.com.highperformancespark.examples.tools.GenerateScalingData.generateMiniScaleRows( 47 | scalasc, rows, numCols 48 | ) 49 | # Schemas are serialized to JSON and sent back and forth 50 | # Construct a Python Schema and turn it into a Java Schema 51 | schema = StructType( 52 | [StructField("zip", IntegerType()), StructField("fuzzyness", DoubleType())] 53 | ) 54 | jschema = javaSparkSession.parseDataType(schema.json()) 55 | # Convert the Java RDD to Java DataFrame 56 | java_dataframe = javaSparkSession.createDataFrame(java_rdd, jschema) 57 | # Wrap the Java DataFrame into a Python DataFrame 58 | python_dataframe = DataFrame(java_dataframe, sqlCtx) 59 | # Convert the Python DataFrame into an RDD 60 | pairRDD = python_dataframe.rdd.map(lambda row: (row[0], row[1])) 61 | return (python_dataframe, pairRDD) 62 | # end::javaInterop[] 63 | 64 | 65 | def runOnDF(df): 66 | result = df.groupBy("zip").avg("fuzzyness").count() 67 | return result 68 | 69 | 70 | def runOnRDD(rdd): 71 | result = ( 72 | rdd.map(lambda xy: (xy[0], (xy[1], 1))) 73 | .reduceByKey(lambda x, y: (x[0] + y[0], x[1] + y[1])) 74 | .count() 75 | ) 76 | return result 77 | 78 | 79 | def groupOnRDD(rdd): 80 | return rdd.groupByKey().mapValues(lambda v: sum(v) / float(len(v))).count() 81 | 82 | 83 | def run(sc, sqlCtx, scalingFactor, size): 84 | """ 85 | Run the simple perf test printing the results to stdout. 86 | 87 | >>> session = SparkSession.builder.getOrCreate() 88 | >>> sc = session._sc 89 | >>> run(sc, session, 5L, 1) 90 | RDD: 91 | ... 92 | group: 93 | ... 94 | df: 95 | ... 96 | yay 97 | >>> session.stop() 98 | """ 99 | (input_df, input_rdd) = generate_scale_data(sqlCtx, scalingFactor, size) 100 | input_rdd.cache().count() 101 | rddTimeings = timeit.repeat( 102 | stmt=lambda: runOnRDD(input_rdd), 103 | repeat=10, 104 | number=1, 105 | timer=time.time, 106 | setup="gc.enable()", 107 | ) 108 | groupTimeings = timeit.repeat( 109 | stmt=lambda: groupOnRDD(input_rdd), 110 | repeat=10, 111 | number=1, 112 | timer=time.time, 113 | setup="gc.enable()", 114 | ) 115 | input_df.cache().count() 116 | dfTimeings = timeit.repeat( 117 | stmt=lambda: runOnDF(input_df), 118 | repeat=10, 119 | number=1, 120 | timer=time.time, 121 | setup="gc.enable()", 122 | ) 123 | print(f"RDD: {rddTimeings}, group: {groupTimeings}, df: {dfTimeings}") 124 | 125 | 126 | def parseArgs(args): 127 | """ 128 | Parse the args, no error checking. 
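
    args[0] is expected to be the script name (as in sys.argv), which is why
    the scaling factor and size are read from args[1] and args[2].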
129 | 130 | >>> parseArgs(["foobaz", "1", "2"]) 131 | (1, 2) 132 | """ 133 | scalingFactor = int(args[1]) 134 | size = int(args[2]) 135 | return (scalingFactor, size) 136 | 137 | 138 | if __name__ == "__main__": 139 | """ 140 | Usage: simple_perf_test scalingFactor size 141 | """ 142 | 143 | scalingFactor = 1 144 | size = 1 145 | if len(sys.argv) > 2: 146 | (scalingFactor, size) = parseArgs(sys.argv) 147 | session = SparkSession.builder.appName("SimplePythonPerf").getOrCreate() 148 | sc = session._sc 149 | run(sc, session, scalingFactor, size) 150 | 151 | sc.stop() 152 | -------------------------------------------------------------------------------- /core/src/test/scala/com/high-performance-spark-examples/goldilocks/GoldilocksLargeTests.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.goldilocks 2 | 3 | import scala.collection.immutable.IndexedSeq 4 | 5 | import org.apache.spark.SparkContext 6 | import org.apache.spark.rdd.RDD 7 | import org.apache.spark.sql.DataFrame 8 | import org.apache.spark.sql.Row 9 | import org.apache.spark.sql.SQLContext 10 | import org.apache.spark.sql.types.DoubleType 11 | import org.apache.spark.sql.types.StructField 12 | import org.apache.spark.sql.types.StructType 13 | 14 | import com.holdenkarau.spark.testing.SharedSparkContext 15 | import org.scalatest.funsuite.AnyFunSuite 16 | import org.apache.spark.sql.SparkSession 17 | 18 | class GoldilocksLargeTests extends AnyFunSuite with SharedSparkContext{ 19 | 20 | 21 | def testGoldilocksImplementations( 22 | data: DataFrame, targetRanks: List[Long], 23 | expectedResult: Map[Int, Iterable[Long]]) = { 24 | 25 | val iterative = 26 | GoldilocksWhileLoop.findRankStatistics(data, targetRanks) 27 | val groupByKey = 28 | GoldilocksGroupByKey.findRankStatistics(data, targetRanks) 29 | val firstTry = 30 | GoldilocksFirstTry.findRankStatistics(data, targetRanks) 31 | val hashMap = 32 | GoldilocksWithHashMap.findRankStatistics(data, targetRanks) 33 | val secondarySort = 34 | GoldilocksSecondarySort.findRankStatistics(data, targetRanks, 35 | data.rdd.partitions.length) 36 | val secondarySortV2 = 37 | GoldilocksSecondarySortV2.findRankStatistics(data, targetRanks) 38 | 39 | expectedResult.foreach { 40 | case((i, ranks)) => 41 | assert(iterative(i).equals(ranks), 42 | "The Iterative solution to goldilocks was incorrect for column " + i) 43 | assert(groupByKey(i).equals(ranks), 44 | "Group by key solution was incorrect") 45 | assert(firstTry(i).equals(ranks), 46 | "GoldilocksFirstTry incorrect for column " + i ) 47 | assert(hashMap(i).equals(ranks), 48 | "GoldilocksWithhashMap incorrect for column " + i) 49 | assert(secondarySort(i).equals(ranks)) 50 | assert(secondarySortV2(i).equals(ranks)) 51 | 52 | } 53 | } 54 | 55 | test("Goldilocks on local data solution "){ 56 | val sqlContext = SparkSession.builder.getOrCreate().sqlContext 57 | val testRanks = List(3L, 8L) 58 | val (smallTestData, result) = 59 | DataCreationUtils.createLocalTestData(5, 10, testRanks) 60 | val schema = StructType( 61 | result.keys.toSeq.map( 62 | n => StructField("Column" + n.toString, DoubleType) 63 | )) 64 | val smallTestDF: DataFrame = 65 | sqlContext.createDataFrame(sc.makeRDD(smallTestData), schema) 66 | testGoldilocksImplementations(smallTestDF, testRanks, result) 67 | } 68 | } 69 | 70 | object DataCreationUtils { 71 | def createLocalTestData(numberCols: Int, numberOfRows: Int, 72 | targetRanks: List[Long]) = { 73 | 74 | val cols = Range(0,numberCols).toArray 75 | 
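    // Each column below is a random permutation of 0 until numberOfRows, so with
    // scalers of 1.0 the rank-r statistic of every column is simply r - 1.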
val scalers = cols.map(x => 1.0) 76 | val rowRange = Range(0, numberOfRows) 77 | val columnArray: Array[IndexedSeq[Double]] = cols.map( 78 | columnIndex => { 79 | val columnValues = rowRange.map( 80 | x => (Math.random(), x)).sortBy(_._1).map(_._2 * scalers(columnIndex)) 81 | columnValues 82 | }) 83 | val rows = rowRange.map( 84 | rowIndex => { 85 | Row.fromSeq(cols.map( colI => columnArray(colI)(rowIndex)).toSeq) 86 | }) 87 | 88 | 89 | val result: Map[Int, Iterable[Long]] = cols.map(i => { 90 | (i, targetRanks.map(r => Math.round((r-1)/scalers(i)))) 91 | }).toMap 92 | 93 | (rows, result) 94 | } 95 | 96 | 97 | def createDistributedData(sc: SparkContext, partitions: Int, 98 | elementsPerPartition: Int, numberOfColumns: Int ) = { 99 | val partitionsStart: RDD[Int] = sc.parallelize( 100 | Array.fill(partitions)(1)) 101 | partitionsStart.repartition(partitions) 102 | 103 | var data: RDD[(Long, List[Int])] = partitionsStart.mapPartitionsWithIndex { 104 | case (partIndex, elements) => 105 | val rows = Range(0, elementsPerPartition) 106 | .map(x => (Math.random(), x)) 107 | .map { 108 | case ((randomNumber, rowValue)) => 109 | (randomNumber, 110 | //index of element 111 | (partIndex * elementsPerPartition.toLong + rowValue, 112 | List(rowValue + partIndex * elementsPerPartition))) 113 | } 114 | rows.toIterator 115 | }.sortByKey().values 116 | 117 | 118 | Range(0, numberOfColumns).foreach(x => { 119 | val nextColumn: RDD[(Long, Int)] = partitionsStart.mapPartitionsWithIndex { 120 | case (partIndex, elements) => 121 | val rows = Range(0, elementsPerPartition) 122 | .map(x => (Math.random(), x)) 123 | .map { 124 | case ((randomNumber, rowValue)) => 125 | (randomNumber, 126 | //index of element 127 | (partIndex * elementsPerPartition.toLong + rowValue, 128 | rowValue + partIndex * elementsPerPartition)) 129 | } 130 | rows.toIterator 131 | }.sortByKey().values 132 | 133 | data = nextColumn.join(data).mapValues(x => x._1 :: x._2) 134 | }) 135 | data 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/goldilocks/RDDJoinExamples.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.goldilocks 2 | 3 | import scala.collection.Map 4 | import scala.reflect.ClassTag 5 | 6 | import org.apache.spark.HashPartitioner 7 | import org.apache.spark.rdd.RDD 8 | 9 | object RDDJoinExamples { 10 | 11 | /* For Example, suppose we have one RDD with some data in the form (Panda id, score) 12 | and another RDD with (Panda id, address), and we want to send each Panda some mail 13 | with her best score. We could join the RDDs on ID and then compute the best score 14 | for each address. Like this: 15 | 16 | 'ToDo: Insert Example' 17 | 18 | However, this is slower than first reducing the score data, so that the 19 | //first dataset contains only one row for each Panda with her best score and then 20 | //joining that data with the address data. 
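  (The naive join is implemented below as joinScoresWithAddress1, and the
  reduce-first version as joinScoresWithAddress2, tagged joinScoresWithAddressFast.)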
21 | 22 | 'ToDO: Insert an example of this' */ 23 | //tag::joinScoresWithAddress[] 24 | def joinScoresWithAddress1( scoreRDD : RDD[(Long, Double)], 25 | addressRDD : RDD[(Long, String )]) : RDD[(Long, (Double, String))]= { 26 | val joinedRDD = scoreRDD.join(addressRDD) 27 | joinedRDD.reduceByKey( (x, y) => if(x._1 > y._1) x else y ) 28 | } 29 | //end::joinScoresWithAddress[] 30 | 31 | //tag::leftOuterJoinScoresWithAddress[] 32 | def outerJoinScoresWithAddress(scoreRDD : RDD[(Long, Double)], 33 | addressRDD: RDD[(Long, String)]) : RDD[(Long, (Double, Option[String]))]= { 34 | val joinedRDD = scoreRDD.leftOuterJoin(addressRDD) 35 | joinedRDD.reduceByKey( (x, y) => if(x._1 > y._1) x else y ) 36 | } 37 | //end::leftOuterJoinScoresWithAddress[] 38 | 39 | //tag::joinScoresWithAddressFast[] 40 | def joinScoresWithAddress2(scoreRDD : RDD[(Long, Double)], 41 | addressRDD: RDD[(Long, String)]) : RDD[(Long, (Double, String))]= { 42 | val bestScoreData = scoreRDD.reduceByKey((x, y) => if(x > y) x else y) 43 | bestScoreData.join(addressRDD) 44 | } 45 | //end::joinScoresWithAddressFast[] 46 | /* 47 | We could make the example in the previous section even faster, 48 | by using the partitioner for the address data as an argument for 49 | the reduce by key step. 50 | 'ToDO: Insert the code to show this here' */ 51 | //tag::joinScoresWithAddress3[] 52 | def joinScoresWithAddress3(scoreRDD: RDD[(Long, Double)], 53 | addressRDD: RDD[(Long, String)]) : RDD[(Long, (Double, String))]= { 54 | // If addressRDD has a known partitioner we should use that, 55 | // otherwise it has a default hash parttioner, which we can reconstruct by 56 | // getting the number of partitions. 57 | val addressDataPartitioner = addressRDD.partitioner match { 58 | case (Some(p)) => p 59 | case (None) => new HashPartitioner(addressRDD.partitions.length) 60 | } 61 | val bestScoreData = scoreRDD.reduceByKey(addressDataPartitioner, 62 | (x, y) => if(x > y) x else y) 63 | bestScoreData.join(addressRDD) 64 | } 65 | //end::joinScoresWithAddress3[] 66 | 67 | def debugString(scoreRDD: RDD[(Long, Double)], 68 | addressRDD: RDD[(Long, String)]) = { 69 | //tag::debugString[] 70 | scoreRDD.join(addressRDD).toDebugString 71 | //end::debugString[] 72 | } 73 | 74 | /* 75 | * Suppose we had two datasets of information about each panda, 76 | * one with the scores, and one with there favorite foods. 77 | * We could use cogroup to associate each Pandas id with an iterator 78 | * of their scores and another iterator of their favorite foods. 79 | */ 80 | def coGroupExample(scoreRDD: RDD[(Long, Double)], foodRDD: RDD[(Long, String)], 81 | addressRDD: RDD[(Long, String)]) = { 82 | //tag::coGroupExample1[] 83 | val cogroupedRDD: RDD[(Long, (Iterable[Double], Iterable[String]))] = 84 | scoreRDD.cogroup(foodRDD) 85 | //end::coGroupExample1[] 86 | 87 | /* 88 | * For example, if we needed to join the panda score data with both address 89 | * and favorite foods, it would be better to use co group than two 90 | * join operations. 91 | */ 92 | //tag::coGroupExample2[] 93 | val addressScoreFood = addressRDD.cogroup(scoreRDD, foodRDD) 94 | //end::coGroupExample2[] 95 | } 96 | 97 | /** 98 | * Performs a broadcast hash join for two RDDs. 
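   *
   * A minimal usage sketch (assuming scoreRDD: RDD[(Long, Double)] and a small
   * addressRDD: RDD[(Long, String)], as in the join examples above):
   * {{{
   * val joined = manualBroadcastHashJoin(scoreRDD, addressRDD)
   * }}}
   *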
99 | * @param bigRDD - the first rdd, should be the larger RDD 100 | * @param smallRDD - the small rdd, should be small enough to fit in memory 101 | * @tparam K - The type of the key 102 | * @tparam V1 - The type of the values for the large array 103 | * @tparam V2 - The type of the values for the second array 104 | * @return 105 | */ 106 | //tag::coreBroadcast[] 107 | def manualBroadcastHashJoin[K : Ordering : ClassTag, V1 : ClassTag, 108 | V2 : ClassTag](bigRDD : RDD[(K, V1)], 109 | smallRDD : RDD[(K, V2)])= { 110 | val smallRDDLocal: Map[K, V2] = smallRDD.collectAsMap() 111 | val smallRDDLocalBcast = bigRDD.sparkContext.broadcast(smallRDDLocal) 112 | bigRDD.mapPartitions(iter => { 113 | iter.flatMap{ 114 | case (k,v1 ) => 115 | smallRDDLocalBcast.value.get(k) match { 116 | // Note: You could switch this to a left join by changing the empty seq 117 | // to instead return Seq(k, Seq.empty[(V1, V2)]) 118 | case None => Seq.empty[(K, (V1, V2))] 119 | case Some(v2) => Seq((k, (v1, v2))) 120 | } 121 | } 122 | }, preservesPartitioning = true) 123 | } 124 | //end::coreBroadcast[] 125 | } 126 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/dataframe/LoadSave.scala: -------------------------------------------------------------------------------- 1 | /** 2 | * Load and save data to/from DataFrames 3 | */ 4 | package com.highperformancespark.examples.dataframe 5 | 6 | import java.util.Properties 7 | 8 | import org.apache.spark.SparkContext 9 | import org.apache.spark.rdd._ 10 | import org.apache.spark.sql._ 11 | import org.apache.spark.sql.types._ 12 | 13 | case class LoadSave(sc: SparkContext, session: SparkSession) { 14 | import session.implicits._ 15 | //tag::createFromRDD[] 16 | def createFromCaseClassRDD(input: RDD[PandaPlace]) = { 17 | // Create DataFrame explicitly using session and schema inference 18 | val df1 = session.createDataFrame(input) 19 | 20 | // Create DataFrame using session implicits and schema inference 21 | val df2 = input.toDF() 22 | 23 | // Create a Row RDD from our RDD of case classes 24 | val rowRDD = input.map(pm => Row(pm.name, 25 | pm.pandas.map(pi => Row(pi.id, pi.zip, pi.happy, pi.attributes)))) 26 | 27 | val pandasType = ArrayType(StructType(List( 28 | StructField("id", LongType, true), 29 | StructField("zip", StringType, true), 30 | StructField("happy", BooleanType, true), 31 | StructField("attributes", ArrayType(FloatType), true)))) 32 | 33 | // Create DataFrame explicitly with specified schema 34 | val schema = StructType(List(StructField("name", StringType, true), 35 | StructField("pandas", pandasType))) 36 | 37 | val df3 = session.createDataFrame(rowRDD, schema) 38 | } 39 | //end::createFromRDD[] 40 | 41 | //tag::createFromRDDBasic[] 42 | def createFromCaseClassRDD(input: Seq[PandaPlace]) = { 43 | val rdd = sc.parallelize(input) 44 | // Create DataFrame explicitly using session and schema inference 45 | val df1 = session.createDataFrame(input) 46 | } 47 | //end::createFromRDDBasic[] 48 | 49 | //tag::createGetSchema[] 50 | def createAndPrintSchema() = { 51 | val damao = RawPanda(1, "M1B 5K7", "giant", true, Array(0.1, 0.1)) 52 | val pandaPlace = PandaPlace("toronto", Array(damao)) 53 | val df = session.createDataFrame(Seq(pandaPlace)) 54 | df.printSchema() 55 | } 56 | //end::createGetSchema[] 57 | 58 | //tag::createFromLocal[] 59 | def createFromLocal(input: Seq[PandaPlace]) = { 60 | session.createDataFrame(input) 61 | } 62 | //end::createFromLocal[] 63 | 64 | 
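  // Note: collect() materializes the entire DataFrame on the driver, so it is
  // only appropriate for small results.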
//tag::collectResults[] 65 | def collectDF(df: DataFrame) = { 66 | val result: Array[Row] = df.collect() 67 | result 68 | } 69 | //end::collectResults[] 70 | 71 | //tag::toRDD[] 72 | def toRDD(input: DataFrame): RDD[RawPanda] = { 73 | val rdd: RDD[Row] = input.rdd 74 | rdd.map(row => RawPanda(row.getAs[Long](0), row.getAs[String](1), 75 | row.getAs[String](2), row.getAs[Boolean](3), row.getAs[Array[Double]](4))) 76 | } 77 | //end::toRDD[] 78 | 79 | //tag::partitionedOutput[] 80 | def writeOutByZip(input: DataFrame): Unit = { 81 | input.write.partitionBy("zipcode").format("json").save("output/") 82 | } 83 | //end::partitionedOutput[] 84 | 85 | //tag::saveAppend[] 86 | def writeAppend(input: DataFrame): Unit = { 87 | input.write.mode(SaveMode.Append).save("output/") 88 | } 89 | //end::saveAppend[] 90 | 91 | def upsertPandas(input: DataFrame): Unit = { 92 | //tag::upsert[] 93 | input.mergeInto("pandaInfo", $"source.id" === $"target.id") 94 | .whenMatched() // Note you can override the general match condition above if desired 95 | .updateAll() 96 | .whenNotMatched() 97 | .insertAll() 98 | //end::upsert[] 99 | } 100 | 101 | def createJDBC() = { 102 | session.read.jdbc("jdbc:dialect:serverName;user=user;password=pass", 103 | "table", new Properties) 104 | 105 | //tag::createJDBC[] 106 | session.read.format("jdbc") 107 | .option("url", "jdbc:dialect:serverName") 108 | .option("dbtable", "table").load() 109 | //end::createJDBC[] 110 | } 111 | 112 | def writeJDBC(df: DataFrame) = { 113 | df.write.jdbc("jdbc:dialect:serverName;user=user;password=pass", 114 | "table", new Properties) 115 | 116 | //tag::writeJDBC[] 117 | df.write.format("jdbc") 118 | .option("url", "jdbc:dialect:serverName") 119 | .option("user", "user") 120 | .option("password", "pass") 121 | .option("dbtable", "table").save() 122 | //end::writeJDBC[] 123 | } 124 | 125 | //tag::loadParquet[] 126 | def loadParquet(path: String): DataFrame = { 127 | // Configure Spark to read binary data as string, 128 | // note: must be configured on session. 
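    // (binaryAsString tells Spark SQL to interpret Parquet BINARY columns written
    // by other tools as strings rather than as raw bytes.)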
129 | session.conf.set("spark.sql.parquet.binaryAsString", "true") 130 | 131 | // Load parquet data using merge schema (configured through option) 132 | session.read 133 | .option("mergeSchema", "true") 134 | .format("parquet") 135 | .load(path) 136 | } 137 | //end::loadParquet[] 138 | 139 | //tag::writeParquet[] 140 | def writeParquet(df: DataFrame, path: String) = { 141 | df.write.format("parquet").save(path) 142 | } 143 | //end::writeParquet[] 144 | 145 | //tag::loadHiveTable[] 146 | def loadHiveTable(): DataFrame = { 147 | session.read.table("pandas") 148 | } 149 | //end::loadHiveTable[] 150 | 151 | //tag::saveManagedTable[] 152 | def saveManagedTable(df: DataFrame): Unit = { 153 | df.write.saveAsTable("pandas") 154 | } 155 | //end::saveManagedTable[] 156 | } 157 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.mllib 2 | 3 | import scala.collection.Map 4 | 5 | import org.apache.spark._ 6 | import org.apache.spark.mllib.classification.LogisticRegressionModel 7 | import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS 8 | import org.apache.spark.mllib.feature._ 9 | import org.apache.spark.mllib.linalg.Vectors 10 | import org.apache.spark.mllib.linalg.{Vector => SparkVector} 11 | import org.apache.spark.mllib.regression.LabeledPoint 12 | import org.apache.spark.rdd.RDD 13 | 14 | import com.highperformancespark.examples.dataframe._ 15 | //end::imports[] 16 | 17 | object GoldilocksMLlib { 18 | 19 | def booleanToDouble(boolean: Boolean): Double = { 20 | if (boolean) 1.0 else 0.0 21 | } 22 | 23 | def toLabeledPointDense(rdd: RDD[RawPanda]): RDD[LabeledPoint] = { 24 | //tag::toLabeledPointDense[] 25 | rdd.map(rp => 26 | LabeledPoint(booleanToDouble(rp.happy), 27 | Vectors.dense(rp.attributes))) 28 | //end::toLabeledPointDense[] 29 | } 30 | 31 | //tag::toSparkVectorDense[] 32 | def toSparkVectorDense(input: Array[Double]) = { 33 | Vectors.dense(input) 34 | } 35 | //end::toSparkVectorDense[] 36 | 37 | //tag::selectTopTen[] 38 | def selectTopTenFeatures(rdd: RDD[LabeledPoint]): 39 | (ChiSqSelectorModel, Array[Int], RDD[SparkVector]) = { 40 | val selector = new ChiSqSelector(10) 41 | val model = selector.fit(rdd) 42 | val topFeatures = model.selectedFeatures 43 | val vecs = rdd.map(_.features) 44 | (model, topFeatures, model.transform(vecs)) 45 | } 46 | //end::selectTopTen[] 47 | 48 | //tag::keepLabeled[] 49 | def selectAndKeepLabeled(rdd: RDD[LabeledPoint]): RDD[LabeledPoint] = { 50 | val selector = new ChiSqSelector(10) 51 | val model = selector.fit(rdd) 52 | rdd.map{ 53 | case LabeledPoint(label, features) => 54 | LabeledPoint(label, model.transform(features)) 55 | } 56 | } 57 | //end::keepLabeled[] 58 | 59 | //tag::createLabelLookup[] 60 | def createLabelLookup[T](rdd: RDD[T]): Map[T, Double] = { 61 | val distinctLabels: Array[T] = rdd.distinct().collect() 62 | distinctLabels.zipWithIndex 63 | .map{case (label, x) => (label, x.toDouble)}.toMap 64 | } 65 | //end::createLabelLookup[] 66 | 67 | 68 | //tag::hashingTFSimple[] 69 | def hashingTf(rdd: RDD[String]): RDD[SparkVector] = { 70 | val ht = new HashingTF() 71 | val tokenized = rdd.map(_.split(" ").toIterable) 72 | ht.transform(tokenized) 73 | } 74 | //end::hashingTFSimple[] 75 | 76 | //tag::word2vecTrain[] 77 | def word2vecTrain(rdd: RDD[String]): Word2VecModel = { 78 | // Tokenize 
our data 79 | val tokenized = rdd.map(_.split(" ").toIterable) 80 | // Construct our word2vec model 81 | val wv = new Word2Vec() 82 | wv.fit(tokenized) 83 | } 84 | //end::word2vecTrain[] 85 | 86 | 87 | //tag::trainScaler[] 88 | // Trains a feature scaler and returns the scaler and scaled features 89 | def trainScaler(rdd: RDD[SparkVector]): (StandardScalerModel, RDD[SparkVector]) = { 90 | val scaler = new StandardScaler() 91 | val scalerModel = scaler.fit(rdd) 92 | (scalerModel, scalerModel.transform(rdd)) 93 | } 94 | //end::trainScaler[] 95 | 96 | //tag::hashingTFPreserve[] 97 | def toVectorPerserving(rdd: RDD[RawPanda]): RDD[(RawPanda, SparkVector)] = { 98 | val ht = new HashingTF() 99 | rdd.map{panda => 100 | val textField = panda.pt 101 | val tokenizedTextField = textField.split(" ").toIterable 102 | (panda, ht.transform(tokenizedTextField)) 103 | } 104 | } 105 | //end::hashingTFPreserve[] 106 | 107 | //tag::hashingTFPreserveZip[] 108 | def hashingTFPreserveZip(rdd: RDD[RawPanda]): RDD[(RawPanda, SparkVector)] = { 109 | val ht = new HashingTF() 110 | val tokenized = rdd.map{panda => panda.pt.split(" ").toIterable} 111 | val vecs = ht.transform(tokenized) 112 | rdd.zip(vecs) 113 | } 114 | //end::hashingTFPreserveZip[] 115 | 116 | //tag::toLabeledPointWithHashing[] 117 | def toLabeledPointWithHashing(rdd: RDD[RawPanda]): RDD[LabeledPoint] = { 118 | val ht = new HashingTF() 119 | rdd.map{rp => 120 | val hashingVec = ht.transform(rp.pt) 121 | val combined = hashingVec.toArray ++ rp.attributes 122 | LabeledPoint(booleanToDouble(rp.happy), 123 | Vectors.dense(combined)) 124 | } 125 | } 126 | //end::toLabeledPointWithHashing[] 127 | 128 | //tag::train[] 129 | def trainModel(rdd: RDD[LabeledPoint]): LogisticRegressionModel = { 130 | val lr = new LogisticRegressionWithLBFGS() 131 | val lrModel = lr.run(rdd) 132 | lrModel 133 | } 134 | //end::train[] 135 | 136 | //tag::trainWithIntercept[] 137 | def trainModelWithInterept(rdd: RDD[LabeledPoint]): LogisticRegressionModel = { 138 | val lr = new LogisticRegressionWithLBFGS() 139 | lr.setIntercept(true) 140 | val lrModel = lr.run(rdd) 141 | lrModel 142 | } 143 | //end::trainWithIntercept[] 144 | 145 | //tag::predict[] 146 | def predict(model: LogisticRegressionModel, rdd: RDD[SparkVector]): RDD[Double] = { 147 | model.predict(rdd) 148 | } 149 | //end::predict[] 150 | 151 | //tag::save[] 152 | def save(sc: SparkContext, path: String, model: LogisticRegressionModel) = { 153 | //tag::savePMML[] 154 | // Save to PMML - remote path 155 | model.toPMML(sc, path + "/pmml") 156 | // Save to PMML local path 157 | model.toPMML(path + "/pmml") 158 | //end::savePMML[] 159 | //tag::saveInternal[] 160 | // Save to internal - remote path 161 | model.save(sc, path + "/internal") 162 | //end::saveInternal[] 163 | } 164 | //end::save[] 165 | 166 | //tag::load[] 167 | def load(sc: SparkContext, path: String): LogisticRegressionModel = { 168 | LogisticRegressionModel.load(sc, path + "/internal") 169 | } 170 | //end::load[] 171 | } 172 | -------------------------------------------------------------------------------- /core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala: -------------------------------------------------------------------------------- 1 | package com.highperformancespark.examples.ml 2 | 3 | import scala.collection.Map 4 | 5 | import org.apache.spark._ 6 | import org.apache.spark.ml._ 7 | import org.apache.spark.ml.classification._ 8 | import org.apache.spark.ml.linalg._ 9 | import org.apache.spark.ml.param._ 10 | import 
org.apache.spark.ml.util.Identifiable 11 | import org.apache.spark.rdd.RDD 12 | import org.apache.spark.sql._ 13 | import org.apache.spark.sql._ 14 | import org.apache.spark.sql.functions._ 15 | import org.apache.spark.sql.types._ 16 | 17 | import com.highperformancespark.examples.dataframe._ 18 | //end::extraImports[] 19 | 20 | //tag::basicPipelineSetup[] 21 | class HardCodedWordCountStage(override val uid: String) extends Transformer { 22 | def this() = this(Identifiable.randomUID("hardcodedwordcount")) 23 | 24 | def copy(extra: ParamMap): HardCodedWordCountStage = { 25 | defaultCopy(extra) 26 | } 27 | //end::basicPipelineSetup[] 28 | 29 | //tag::basicTransformSchema[] 30 | override def transformSchema(schema: StructType): StructType = { 31 | // Check that the input type is a string 32 | val idx = schema.fieldIndex("happy_pandas") 33 | val field = schema.fields(idx) 34 | if (field.dataType != StringType) { 35 | throw new Exception( 36 | s"Input type ${field.dataType} did not match input type StringType") 37 | } 38 | // Add the return field 39 | schema.add(StructField("happy_panda_counts", IntegerType, false)) 40 | } 41 | //end::basicTransformSchema[] 42 | 43 | //tag::transformFunction[] 44 | def transform(df: Dataset[_]): DataFrame = { 45 | val wordcount = udf { in: String => in.split(" ").size } 46 | df.select(col("*"), 47 | wordcount(df.col("happy_pandas")).as("happy_panda_counts")) 48 | } 49 | //end::transformFunction[] 50 | } 51 | 52 | 53 | //tag::paramTransformer[] 54 | class ConfigurableWordCount(override val uid: String) extends Transformer { 55 | final val inputCol= new Param[String](this, "inputCol", "The input column") 56 | final val outputCol = new Param[String](this, "outputCol", "The output column") 57 | 58 | def setInputCol(value: String): this.type = set(inputCol, value) 59 | 60 | def setOutputCol(value: String): this.type = set(outputCol, value) 61 | 62 | def this() = this(Identifiable.randomUID("configurablewordcount")) 63 | 64 | def copy(extra: ParamMap): HardCodedWordCountStage = { 65 | defaultCopy(extra) 66 | } 67 | 68 | override def transformSchema(schema: StructType): StructType = { 69 | // Check that the input type is a string 70 | val idx = schema.fieldIndex($(inputCol)) 71 | val field = schema.fields(idx) 72 | if (field.dataType != StringType) { 73 | throw new Exception( 74 | s"Input type ${field.dataType} did not match input type StringType") 75 | } 76 | // Add the return field 77 | schema.add(StructField($(outputCol), IntegerType, false)) 78 | } 79 | 80 | def transform(df: Dataset[_]): DataFrame = { 81 | val wordcount = udf { in: String => in.split(" ").size } 82 | df.select(col("*"), wordcount(df.col($(inputCol))).as($(outputCol))) 83 | } 84 | } 85 | //end::paramTransformer[] 86 | 87 | 88 | //tag::simpleIndexer[] 89 | trait SimpleIndexerParams extends Params { 90 | final val inputCol= new Param[String](this, "inputCol", "The input column") 91 | final val outputCol = new Param[String](this, "outputCol", "The output column") 92 | } 93 | 94 | class SimpleIndexer(override val uid: String) 95 | extends Estimator[SimpleIndexerModel] with SimpleIndexerParams { 96 | 97 | def setInputCol(value: String) = set(inputCol, value) 98 | 99 | def setOutputCol(value: String) = set(outputCol, value) 100 | 101 | def this() = this(Identifiable.randomUID("simpleindexer")) 102 | 103 | override def copy(extra: ParamMap): SimpleIndexer = { 104 | defaultCopy(extra) 105 | } 106 | 107 | override def transformSchema(schema: StructType): StructType = { 108 | // Check that the input type is 
a string 109 | val idx = schema.fieldIndex($(inputCol)) 110 | val field = schema.fields(idx) 111 | if (field.dataType != StringType) { 112 | throw new Exception( 113 | s"Input type ${field.dataType} did not match input type StringType") 114 | } 115 | // Add the return field 116 | schema.add(StructField($(outputCol), IntegerType, false)) 117 | } 118 | 119 | override def fit(dataset: Dataset[_]): SimpleIndexerModel = { 120 | import dataset.sparkSession.implicits._ 121 | val words = dataset.select(dataset($(inputCol)).as[String]).distinct 122 | .collect() 123 | // Construct the model 124 | val model = new SimpleIndexerModel(uid, words) 125 | // Copy the parameters to the model 126 | copyValues(model) 127 | } 128 | } 129 | 130 | class SimpleIndexerModel(override val uid: String, words: Array[String]) 131 | extends Model[SimpleIndexerModel] with SimpleIndexerParams { 132 | 133 | override def copy(extra: ParamMap): SimpleIndexerModel = { 134 | defaultCopy(extra) 135 | } 136 | 137 | private val labelToIndex: Map[String, Double] = words.zipWithIndex. 138 | map{case (x, y) => (x, y.toDouble)}.toMap 139 | 140 | override def transformSchema(schema: StructType): StructType = { 141 | // Check that the input type is a string 142 | val idx = schema.fieldIndex($(inputCol)) 143 | val field = schema.fields(idx) 144 | if (field.dataType != StringType) { 145 | throw new Exception( 146 | s"Input type ${field.dataType} did not match input type StringType") 147 | } 148 | // Add the return field 149 | schema.add(StructField($(outputCol), IntegerType, false)) 150 | } 151 | 152 | override def transform(dataset: Dataset[_]): DataFrame = { 153 | val indexer = udf { label: String => labelToIndex(label) } 154 | dataset.select(col("*"), 155 | indexer(dataset($(inputCol)).cast(StringType)).as($(outputCol))) 156 | } 157 | } 158 | //end::SimpleIndexer[] 159 | --------------------------------------------------------------------------------
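A minimal sketch of how the SimpleIndexer estimator above could be used, assuming a
DataFrame `df` with a String column named "zip" (the column name, output column name,
and variable names here are illustrative, not taken from the repository):

    val indexer = new SimpleIndexer()
      .setInputCol("zip")
      .setOutputCol("zipIndex")
    // fit() collects the distinct values of the input column on the driver
    // and returns a SimpleIndexerModel holding the label-to-index mapping.
    val model = indexer.fit(df)
    // transform() appends the numeric index column to the existing columns.
    val indexed = model.transform(df)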