├── .github ├── actions │ ├── build-whl │ │ └── action.yml │ ├── build │ │ └── action.yml │ ├── check-compat │ │ └── action.yml │ ├── prime-caches │ │ └── action.yml │ ├── test-jvm │ │ └── action.yml │ └── test-python │ │ └── action.yml ├── dependabot.yml └── workflows │ ├── build-jvm.yml │ ├── build-python.yml │ ├── build-snapshots.yml │ ├── check.yml │ ├── ci.yml │ ├── clear-caches.yaml │ ├── prepare-release.yml │ ├── prime-caches.yml │ ├── publish-release.yml │ ├── publish-snapshot.yml │ ├── test-jvm.yml │ ├── test-python.yml │ ├── test-results.yml │ └── test-snapshots.yml ├── .gitignore ├── .scalafmt.conf ├── CHANGELOG.md ├── CONDITIONAL.md ├── DIFF.md ├── GROUPS.md ├── HISTOGRAM.md ├── LICENSE ├── MAINTAINERS.md ├── PARQUET.md ├── PARTITIONING.md ├── PYSPARK-DEPS.md ├── README.md ├── RELEASE.md ├── ROW_NUMBER.md ├── SECURITY.md ├── build-whl.sh ├── bump-version.sh ├── examples └── python-deps │ ├── Dockerfile │ ├── docker-compose.yml │ └── example.py ├── pom.xml ├── python ├── README.md ├── gresearch │ ├── __init__.py │ └── spark │ │ ├── __init__.py │ │ ├── diff │ │ ├── __init__.py │ │ └── comparator │ │ │ └── __init__.py │ │ └── parquet │ │ └── __init__.py ├── pyproject.toml ├── requirements-3.0_2.12.txt ├── requirements-3.1_2.12.txt ├── requirements-3.2_2.12.txt ├── requirements-3.2_2.13.txt ├── requirements-3.3_2.12.txt ├── requirements-3.3_2.13.txt ├── requirements-3.4_2.12.txt ├── requirements-3.4_2.13.txt ├── requirements-3.5_2.12.txt ├── requirements-3.5_2.13.txt ├── requirements-4.0_2.13.txt ├── requirements-4.1_2.13.txt ├── setup.py └── test │ ├── __init__.py │ ├── requirements.txt │ ├── spark_common.py │ ├── test_diff.py │ ├── test_histogram.py │ ├── test_job_description.py │ ├── test_jvm.py │ ├── test_package.py │ ├── test_parquet.py │ └── test_row_number.py ├── release.sh ├── set-version.sh ├── src ├── main │ ├── scala-spark-3.0 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── package.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-3.1 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── package.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-3.2 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── package.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-3.3 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── package.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── 
FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-3.4 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── package.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-3.5 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── package.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-4.0 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── extension.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-4.1 │ └── scala │ │ └── uk │ │ └── co │ │ └── gresearch │ │ ├── package.scala │ │ └── spark │ │ ├── BuildVersion.scala │ │ ├── Histogram.scala │ │ ├── RowNumbers.scala │ │ ├── SparkVersion.scala │ │ ├── UnpersistHandle.scala │ │ ├── diff │ │ ├── App.scala │ │ ├── Diff.scala │ │ ├── DiffComparators.scala │ │ ├── DiffOptions.scala │ │ ├── comparator │ │ │ ├── DefaultDiffComparator.scala │ │ │ ├── DiffComparator.scala │ │ │ ├── DurationDiffComparator.scala │ │ │ ├── EpsilonDiffComparator.scala │ │ │ ├── EquivDiffComparator.scala │ │ │ ├── MapDiffComparator.scala │ │ │ ├── NullSafeEqualDiffComparator.scala │ │ │ ├── TypedDiffComparator.scala │ │ │ └── WhitespaceDiffComparator.scala │ │ └── package.scala │ │ ├── group │ │ └── package.scala │ │ ├── package.scala │ │ └── parquet │ │ └── package.scala └── test │ ├── files │ ├── nested.parquet │ └── test.parquet │ │ ├── file1.parquet │ │ └── file2.parquet │ ├── java │ └── uk │ │ └── co │ │ └── gresearch │ │ └── test │ │ ├── SparkJavaTests.java │ │ └── diff │ │ ├── DiffJavaTests.java │ │ ├── JavaValue.java │ │ └── JavaValueAs.java │ ├── resources │ ├── log4j.properties │ └── log4j2.properties │ ├── scala-spark-3 │ └── uk │ │ └── co │ │ └── gresearch │ │ └── spark │ │ └── SparkSuiteHelper.scala │ ├── scala-spark-4 │ └── uk │ │ └── co │ │ └── gresearch │ │ └── spark │ │ └── SparkSuiteHelper.scala │ └── scala │ └── uk │ └── co │ └── gresearch │ └── spark │ ├── GroupBySuite.scala │ ├── HistogramSuite.scala │ ├── SparkSuite.scala │ ├── SparkTestSession.scala │ ├── WritePartitionedSuite.scala │ ├── diff │ ├── AppSuite.scala │ ├── DiffComparatorSuite.scala │ ├── DiffOptionsSuite.scala │ ├── DiffSuite.scala │ └── examples │ │ └── Examples.scala │ ├── group │ └── GroupSuite.scala │ ├── parquet │ └── ParquetSuite.scala │ └── test │ └── package.scala ├── test-release.py ├── test-release.scala ├── test-release.sh ├── with-job-description.png └── without-job-description.png /.github/actions/build-whl/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Build Whl' 2 | author: 'EnricoMi' 3 | description: 'A GitHub 
Action that builds the pyspark-extension package' 4 | 5 | inputs: 6 | spark-version: 7 | description: Spark version, e.g. 3.4.0, 3.4.0-SNAPSHOT, or 4.0.0-preview1 8 | required: true 9 | scala-version: 10 | description: Scala version, e.g. 2.12.15 11 | required: true 12 | spark-compat-version: 13 | description: Spark compatibility version, e.g. 3.4 14 | required: true 15 | scala-compat-version: 16 | description: Scala compatibility version, e.g. 2.12 17 | required: true 18 | java-compat-version: 19 | description: Java compatibility version, e.g. 8 20 | required: true 21 | python-version: 22 | description: Python version, e.g. 3.8 23 | required: true 24 | 25 | runs: 26 | using: 'composite' 27 | steps: 28 | - name: Fetch Binaries Artifact 29 | uses: actions/download-artifact@v4 30 | with: 31 | name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }} 32 | path: . 33 | 34 | - name: Set versions in pom.xml 35 | run: | 36 | ./set-version.sh ${{ inputs.spark-version }} ${{ inputs.scala-version }} 37 | git diff 38 | shell: bash 39 | 40 | - name: Restore Maven packages cache 41 | if: github.event_name != 'schedule' 42 | uses: actions/cache/restore@v4 43 | with: 44 | path: ~/.m2/repository 45 | key: ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 46 | restore-keys: 47 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 48 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}- 49 | 50 | - name: Setup JDK ${{ inputs.java-compat-version }} 51 | uses: actions/setup-java@v4 52 | with: 53 | java-version: ${{ inputs.java-compat-version }} 54 | distribution: 'zulu' 55 | 56 | - name: Setup Python 57 | uses: actions/setup-python@v5 58 | with: 59 | python-version: ${{ inputs.python-version }} 60 | 61 | - name: Install Python dependencies 62 | run: | 63 | # Install Python dependencies 64 | echo "::group::pip install" 65 | python -m pip install --upgrade pip build twine 66 | echo "::endgroup::" 67 | shell: bash 68 | 69 | - name: Build whl 70 | run: | 71 | # Build whl 72 | echo "::group::build-whl.sh" 73 | ./build-whl.sh 74 | echo "::endgroup::" 75 | shell: bash 76 | 77 | - name: Test whl 78 | run: | 79 | # Test whl 80 | echo "::group::test-release.py" 81 | twine check python/dist/* 82 | pip install -r python/requirements-${{ inputs.spark-compat-version }}_${{ inputs.scala-compat-version }}.txt 83 | pip install python/dist/*.whl 84 | python test-release.py 85 | echo "::endgroup::" 86 | shell: bash 87 | 88 | - name: Upload whl 89 | uses: actions/upload-artifact@v4 90 | with: 91 | name: Whl (Spark ${{ inputs.spark-compat-version }} Scala ${{ inputs.scala-compat-version }}) 92 | path: | 93 | python/dist/*.whl 94 | 95 | branding: 96 | icon: 'check-circle' 97 | color: 'green' 98 | -------------------------------------------------------------------------------- /.github/actions/build/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Build' 2 | author: 'EnricoMi' 3 | description: 'A GitHub Action that builds spark-extension' 4 | 5 | inputs: 6 | spark-version: 7 | description: Spark version, e.g. 3.4.0 or 3.4.0-SNAPSHOT 8 | required: true 9 | scala-version: 10 | description: Scala version, e.g. 2.12.15 11 | required: true 12 | spark-compat-version: 13 | description: Spark compatibility version, e.g. 
3.4 14 | required: true 15 | scala-compat-version: 16 | description: Scala compatibility version, e.g. 2.12 17 | required: true 18 | java-compat-version: 19 | description: Java compatibility version, e.g. 8 20 | required: true 21 | hadoop-version: 22 | description: Hadoop version, e.g. 2.7 or 2 23 | required: true 24 | 25 | runs: 26 | using: 'composite' 27 | steps: 28 | - name: Set versions in pom.xml 29 | run: | 30 | ./set-version.sh ${{ inputs.spark-version }} ${{ inputs.scala-version }} 31 | git diff 32 | shell: bash 33 | 34 | - name: Restore Maven packages cache 35 | if: github.event_name != 'schedule' 36 | uses: actions/cache/restore@v4 37 | with: 38 | path: ~/.m2/repository 39 | key: ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 40 | restore-keys: 41 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 42 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}- 43 | 44 | - name: Setup JDK ${{ inputs.java-compat-version }} 45 | uses: actions/setup-java@v4 46 | with: 47 | java-version: ${{ inputs.java-compat-version }} 48 | distribution: 'zulu' 49 | 50 | - name: Build 51 | env: 52 | JDK_JAVA_OPTIONS: --add-exports java.base/sun.nio.ch=ALL-UNNAMED --add-exports java.base/sun.util.calendar=ALL-UNNAMED 53 | run: | 54 | # Build 55 | echo "::group::mvn compile" 56 | mvn --batch-mode --update-snapshots -Dspotless.check.skip clean compile test-compile 57 | echo "::endgroup::" 58 | 59 | echo "::group::mvn package" 60 | mvn --batch-mode package -Dspotless.check.skip -DskipTests -Dmaven.test.skip=true 61 | echo "::endgroup::" 62 | 63 | echo "::group::mvn install" 64 | mvn --batch-mode install -Dspotless.check.skip -DskipTests -Dmaven.test.skip=true -Dgpg.skip 65 | echo "::endgroup::" 66 | shell: bash 67 | 68 | - name: Upload Binaries 69 | uses: actions/upload-artifact@v4 70 | with: 71 | name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }} 72 | path: | 73 | * 74 | !.* 75 | !target/*-javadoc.jar 76 | !target/*-sources.jar 77 | !target/site 78 | 79 | branding: 80 | icon: 'check-circle' 81 | color: 'green' 82 | -------------------------------------------------------------------------------- /.github/actions/check-compat/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Check' 2 | author: 'EnricoMi' 3 | description: 'A GitHub Action that checks compatibility of spark-extension' 4 | 5 | inputs: 6 | spark-version: 7 | description: Spark version, e.g. 3.4.0 or 3.4.0-SNAPSHOT 8 | required: true 9 | scala-version: 10 | description: Scala version, e.g. 2.12.15 11 | required: true 12 | spark-compat-version: 13 | description: Spark compatibility version, e.g. 3.4 14 | required: true 15 | scala-compat-version: 16 | description: Scala compatibility version, e.g. 2.12 17 | required: true 18 | package-version: 19 | description: Spark-Extension version to check against 20 | required: true 21 | 22 | runs: 23 | using: 'composite' 24 | steps: 25 | - name: Fetch Binaries Artifact 26 | uses: actions/download-artifact@v4 27 | with: 28 | name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }} 29 | path: . 
30 | 31 | - name: Set versions in pom.xml 32 | run: | 33 | ./set-version.sh ${{ inputs.spark-version }} ${{ inputs.scala-version }} 34 | git diff 35 | shell: bash 36 | 37 | - name: Restore Maven packages cache 38 | if: github.event_name != 'schedule' 39 | uses: actions/cache/restore@v4 40 | with: 41 | path: ~/.m2/repository 42 | key: ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 43 | restore-keys: 44 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 45 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}- 46 | 47 | - name: Setup JDK 1.8 48 | uses: actions/setup-java@v4 49 | with: 50 | java-version: '8' 51 | distribution: 'zulu' 52 | 53 | - name: Install Checker 54 | run: | 55 | # Install Checker 56 | echo "::group::apt update install" 57 | sudo apt update 58 | sudo apt install japi-compliance-checker 59 | echo "::endgroup::" 60 | shell: bash 61 | 62 | - name: Release exists 63 | id: exists 64 | continue-on-error: true 65 | run: | 66 | # Release exists 67 | curl --head --fail https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_${{ inputs.scala-compat-version }}/${{ inputs.package-version }}-${{ inputs.spark-compat-version }}/spark-extension_${{ inputs.scala-compat-version }}-${{ inputs.package-version }}-${{ inputs.spark-compat-version }}.jar 68 | shell: bash 69 | 70 | - name: Fetch package 71 | if: steps.exists.outcome == 'success' 72 | run: | 73 | # Fetch package 74 | echo "::group::mvn dependency:get" 75 | mvn dependency:get -Dtransitive=false -DremoteRepositories -Dartifact=uk.co.gresearch.spark:spark-extension_${{ inputs.scala-compat-version }}:${{ inputs.package-version }}-${{ inputs.spark-compat-version }} 76 | echo "::endgroup::" 77 | shell: bash 78 | 79 | - name: Check 80 | if: steps.exists.outcome == 'success' 81 | continue-on-error: ${{ github.ref == 'refs/heads/master' }} 82 | run: | 83 | # Check 84 | echo "::group::japi-compliance-checker" 85 | ls -lah ~/.m2/repository/uk/co/gresearch/spark/spark-extension_${{ inputs.scala-compat-version }}/${{ inputs.package-version }}-${{ inputs.spark-compat-version }}/spark-extension_${{ inputs.scala-compat-version }}-${{ inputs.package-version }}-${{ inputs.spark-compat-version }}.jar target/spark-extension*.jar 86 | japi-compliance-checker ~/.m2/repository/uk/co/gresearch/spark/spark-extension_${{ inputs.scala-compat-version }}/${{ inputs.package-version }}-${{ inputs.spark-compat-version }}/spark-extension_${{ inputs.scala-compat-version }}-${{ inputs.package-version }}-${{ inputs.spark-compat-version }}.jar target/spark-extension*.jar 87 | echo "::endgroup::" 88 | shell: bash 89 | 90 | - name: Upload Report 91 | uses: actions/upload-artifact@v4 92 | if: always() && steps.exists.outcome == 'success' 93 | with: 94 | name: Compat-Report-${{ inputs.spark-compat-version }} 95 | path: compat_reports/spark-extension/* 96 | 97 | branding: 98 | icon: 'check-circle' 99 | color: 'green' 100 | -------------------------------------------------------------------------------- /.github/actions/prime-caches/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Prime caches' 2 | author: 'EnricoMi' 3 | description: 'A GitHub Action that primes caches' 4 | 5 | inputs: 6 | spark-version: 7 | description: Spark version, e.g. 3.4.0 or 3.4.0-SNAPSHOT 8 | required: true 9 | scala-version: 10 | description: Scala version, e.g. 
2.12.15 11 | required: true 12 | spark-compat-version: 13 | description: Spark compatibility version, e.g. 3.4 14 | required: true 15 | scala-compat-version: 16 | description: Scala compatibility version, e.g. 2.12 17 | required: true 18 | java-compat-version: 19 | description: Java compatibility version, e.g. 8 20 | required: true 21 | hadoop-version: 22 | description: Hadoop version, e.g. 2.7 or 2 23 | required: true 24 | 25 | runs: 26 | using: 'composite' 27 | steps: 28 | - name: Set versions in pom.xml 29 | run: | 30 | ./set-version.sh ${{ inputs.spark-version }} ${{ inputs.scala-version }} 31 | git diff 32 | shell: bash 33 | 34 | - name: Setup JDK ${{ inputs.java-compat-version }} 35 | uses: actions/setup-java@v4 36 | with: 37 | java-version: ${{ inputs.java-compat-version }} 38 | distribution: 'zulu' 39 | 40 | - name: Build 41 | env: 42 | JDK_JAVA_OPTIONS: --add-exports java.base/sun.nio.ch=ALL-UNNAMED --add-exports java.base/sun.util.calendar=ALL-UNNAMED 43 | run: | 44 | # Build 45 | echo "::group::mvn dependency:go-offline" 46 | mvn --batch-mode dependency:go-offline 47 | echo "::endgroup::" 48 | shell: bash 49 | 50 | - name: Save Maven packages cache 51 | uses: actions/cache/save@v4 52 | with: 53 | path: ~/.m2/repository 54 | key: ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }}-${{ github.run_id }} 55 | 56 | - name: Setup Spark Binaries 57 | if: ( ! contains(inputs.spark-version, '-SNAPSHOT') ) 58 | env: 59 | SPARK_PACKAGE: spark-${{ inputs.spark-version }}/spark-${{ inputs.spark-version }}-bin-hadoop${{ inputs.hadoop-version }}${{ inputs.scala-compat-version == '2.13' && '-scala2.13' || '' }}.tgz 60 | run: | 61 | wget --progress=dot:giga "https://www.apache.org/dyn/closer.lua/spark/${SPARK_PACKAGE}?action=download" -O - | tar -xzC "${{ runner.temp }}" 62 | archive=$(basename "${SPARK_PACKAGE}") bash -c "mv -v "${{ runner.temp }}/\${archive/%.tgz/}" ~/spark" 63 | shell: bash 64 | 65 | - name: Save Spark Binaries cache 66 | if: ( ! contains(inputs.spark-version, '-SNAPSHOT') ) 67 | uses: actions/cache/save@v4 68 | with: 69 | path: ~/spark 70 | key: ${{ runner.os }}-spark-binaries-${{ inputs.spark-version }}-${{ inputs.scala-compat-version }}-${{ github.run_id }} 71 | 72 | branding: 73 | icon: 'check-circle' 74 | color: 'green' 75 | -------------------------------------------------------------------------------- /.github/actions/test-jvm/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Test JVM' 2 | author: 'EnricoMi' 3 | description: 'A GitHub Action that tests JVM spark-extension' 4 | 5 | inputs: 6 | spark-version: 7 | description: Spark version, e.g. 3.4.0 or 3.4.0-SNAPSHOT 8 | required: true 9 | scala-version: 10 | description: Scala version, e.g. 2.12.15 11 | required: true 12 | spark-compat-version: 13 | description: Spark compatibility version, e.g. 3.4 14 | required: true 15 | scala-compat-version: 16 | description: Scala compatibility version, e.g. 2.12 17 | required: true 18 | hadoop-version: 19 | description: Hadoop version, e.g. 2.7 or 2 20 | required: true 21 | java-compat-version: 22 | description: Java compatibility version, e.g. 8 23 | required: true 24 | 25 | runs: 26 | using: 'composite' 27 | steps: 28 | - name: Fetch Binaries Artifact 29 | uses: actions/download-artifact@v4 30 | with: 31 | name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }} 32 | path: . 
33 | 34 | - name: Set versions in pom.xml 35 | run: | 36 | ./set-version.sh ${{ inputs.spark-version }} ${{ inputs.scala-version }} 37 | git diff 38 | shell: bash 39 | 40 | - name: Restore Spark Binaries cache 41 | if: github.event_name != 'schedule' && ! contains(inputs.spark-version, '-SNAPSHOT') 42 | uses: actions/cache/restore@v4 43 | with: 44 | path: ~/spark 45 | key: ${{ runner.os }}-spark-binaries-${{ inputs.spark-version }}-${{ inputs.scala-compat-version }} 46 | restore-keys: 47 | ${{ runner.os }}-spark-binaries-${{ inputs.spark-version }}-${{ inputs.scala-compat-version }} 48 | 49 | - name: Setup Spark Binaries 50 | if: ( ! contains(inputs.spark-version, '-SNAPSHOT') ) 51 | env: 52 | SPARK_PACKAGE: spark-${{ inputs.spark-version }}/spark-${{ inputs.spark-version }}-bin-hadoop${{ inputs.hadoop-version }}${{ inputs.scala-compat-version == '2.13' && '-scala2.13' || '' }}.tgz 53 | run: | 54 | # Setup Spark Binaries 55 | if [[ ! -e ~/spark ]] 56 | then 57 | wget --progress=dot:giga "https://www.apache.org/dyn/closer.lua/spark/${SPARK_PACKAGE}?action=download" -O - | tar -xzC "${{ runner.temp }}" 58 | archive=$(basename "${SPARK_PACKAGE}") bash -c "mv -v "${{ runner.temp }}/\${archive/%.tgz/}" ~/spark" 59 | fi 60 | echo "SPARK_HOME=$(cd ~/spark; pwd)" >> $GITHUB_ENV 61 | shell: bash 62 | 63 | - name: Restore Maven packages cache 64 | if: github.event_name != 'schedule' 65 | uses: actions/cache/restore@v4 66 | with: 67 | path: ~/.m2/repository 68 | key: ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 69 | restore-keys: 70 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 71 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}- 72 | 73 | - name: Setup JDK ${{ inputs.java-compat-version }} 74 | uses: actions/setup-java@v4 75 | with: 76 | java-version: ${{ inputs.java-compat-version }} 77 | distribution: 'zulu' 78 | 79 | - name: Scala and Java Tests 80 | env: 81 | JDK_JAVA_OPTIONS: --add-exports java.base/sun.nio.ch=ALL-UNNAMED --add-exports java.base/sun.util.calendar=ALL-UNNAMED 82 | run: | 83 | # Scala and Java Tests 84 | echo "::group::mvn test" 85 | mvn --batch-mode --update-snapshots -Dspotless.check.skip test 86 | echo "::endgroup::" 87 | shell: bash 88 | 89 | - name: Diff App test 90 | if: ( ! 
contains(inputs.spark-version, '-SNAPSHOT') ) 91 | run: | 92 | # Diff App test 93 | echo "::group::spark-submit" 94 | $SPARK_HOME/bin/spark-submit --packages com.github.scopt:scopt_${{ inputs.scala-compat-version }}:4.1.0 target/spark-extension_*.jar --format parquet --id id src/test/files/test.parquet/file1.parquet src/test/files/test.parquet/file2.parquet diff.parquet 95 | echo "::endgroup::" 96 | 97 | echo "::group::spark-shell" 98 | $SPARK_HOME/bin/spark-shell <<< 'val df = spark.read.parquet("diff.parquet").orderBy($"id").groupBy($"diff").count; df.show; if (df.count != 2) sys.exit(1)' 99 | echo "::endgroup::" 100 | shell: bash 101 | 102 | - name: Generate Unit Test Report 103 | if: failure() 104 | run: | 105 | # Generate Unit Test Report 106 | echo "::group::mvn report-only" 107 | mvn --batch-mode surefire-report:report-only 108 | echo "::endgroup::" 109 | shell: bash 110 | 111 | - name: Upload Unit Test Results 112 | if: always() 113 | uses: actions/upload-artifact@v4 114 | with: 115 | name: JVM Test Results (Spark ${{ inputs.spark-version }} Scala ${{ inputs.scala-version }}) 116 | path: | 117 | target/surefire-reports/*.xml 118 | !target/surefire-reports/TEST-org.scalatest*.xml 119 | target/site/surefire-report.html 120 | 121 | branding: 122 | icon: 'check-circle' 123 | color: 'green' 124 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | 8 | - package-ecosystem: "maven" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /.github/workflows/build-jvm.yml: -------------------------------------------------------------------------------- 1 | name: Build JVM 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | build: 8 | name: Build (Spark ${{ matrix.spark-version }} Scala ${{ matrix.scala-version }}) 9 | runs-on: ubuntu-latest 10 | 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | include: 15 | - spark-compat-version: '3.0' 16 | spark-version: '3.0.3' 17 | scala-compat-version: '2.12' 18 | scala-version: '2.12.10' 19 | hadoop-version: '2.7' 20 | - spark-compat-version: '3.1' 21 | spark-version: '3.1.3' 22 | scala-compat-version: '2.12' 23 | scala-version: '2.12.10' 24 | hadoop-version: '2.7' 25 | - spark-compat-version: '3.2' 26 | spark-version: '3.2.4' 27 | scala-compat-version: '2.12' 28 | scala-version: '2.12.15' 29 | hadoop-version: '2.7' 30 | - spark-compat-version: '3.3' 31 | spark-version: '3.3.4' 32 | scala-compat-version: '2.12' 33 | scala-version: '2.12.15' 34 | hadoop-version: '3' 35 | - spark-compat-version: '3.4' 36 | scala-compat-version: '2.12' 37 | scala-version: '2.12.17' 38 | spark-version: '3.4.4' 39 | hadoop-version: '3' 40 | - spark-compat-version: '3.5' 41 | scala-compat-version: '2.12' 42 | scala-version: '2.12.18' 43 | spark-version: '3.5.5' 44 | hadoop-version: '3' 45 | 46 | - spark-compat-version: '3.2' 47 | spark-version: '3.2.4' 48 | scala-compat-version: '2.13' 49 | scala-version: '2.13.5' 50 | hadoop-version: '3.2' 51 | - spark-compat-version: '3.3' 52 | spark-version: '3.3.4' 53 | scala-compat-version: '2.13' 54 | scala-version: '2.13.8' 55 | hadoop-version: '3' 56 | - spark-compat-version: '3.4' 57 | scala-compat-version: '2.13' 58 | scala-version: '2.13.8' 59 | spark-version: '3.4.4' 60 | 
hadoop-version: '3' 61 | - spark-compat-version: '3.5' 62 | scala-compat-version: '2.13' 63 | scala-version: '2.13.8' 64 | spark-version: '3.5.5' 65 | hadoop-version: '3' 66 | 67 | steps: 68 | - name: Checkout 69 | uses: actions/checkout@v4 70 | 71 | - name: Build 72 | uses: ./.github/actions/build 73 | with: 74 | spark-version: ${{ matrix.spark-version }} 75 | scala-version: ${{ matrix.scala-version }} 76 | spark-compat-version: ${{ matrix.spark-compat-version }} 77 | scala-compat-version: ${{ matrix.scala-compat-version }} 78 | hadoop-version: ${{ matrix.hadoop-version }} 79 | java-compat-version: '8' 80 | -------------------------------------------------------------------------------- /.github/workflows/build-python.yml: -------------------------------------------------------------------------------- 1 | name: Build Python 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | # pyspark<4 is not available for snapshots or scala other than 2.12 8 | whl: 9 | name: Build whl (Spark ${{ matrix.spark-version }} Scala ${{ matrix.scala-version }}) 10 | runs-on: ubuntu-latest 11 | 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | include: 16 | - spark-compat-version: '3.0' 17 | spark-version: '3.0.3' 18 | scala-compat-version: '2.12' 19 | scala-version: '2.12.10' 20 | java-compat-version: '8' 21 | - spark-compat-version: '3.1' 22 | spark-version: '3.1.3' 23 | scala-compat-version: '2.12' 24 | scala-version: '2.12.10' 25 | java-compat-version: '8' 26 | - spark-compat-version: '3.2' 27 | spark-version: '3.2.4' 28 | scala-compat-version: '2.12' 29 | scala-version: '2.12.15' 30 | java-compat-version: '8' 31 | - spark-compat-version: '3.3' 32 | spark-version: '3.3.4' 33 | scala-compat-version: '2.12' 34 | scala-version: '2.12.15' 35 | java-compat-version: '8' 36 | - spark-compat-version: '3.4' 37 | spark-version: '3.4.4' 38 | scala-compat-version: '2.12' 39 | scala-version: '2.12.17' 40 | java-compat-version: '8' 41 | - spark-compat-version: '3.5' 42 | spark-version: '3.5.5' 43 | scala-compat-version: '2.12' 44 | scala-version: '2.12.18' 45 | java-compat-version: '8' 46 | 47 | steps: 48 | - name: Checkout 49 | uses: actions/checkout@v4 50 | 51 | - name: Build 52 | uses: ./.github/actions/build-whl 53 | with: 54 | spark-version: ${{ matrix.spark-version }} 55 | scala-version: ${{ matrix.scala-version }} 56 | spark-compat-version: ${{ matrix.spark-compat-version }} 57 | scala-compat-version: ${{ matrix.scala-compat-version }} 58 | java-compat-version: ${{ matrix.java-compat-version }} 59 | python-version: "3.9" 60 | -------------------------------------------------------------------------------- /.github/workflows/build-snapshots.yml: -------------------------------------------------------------------------------- 1 | name: Build Snapshots 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | build: 8 | name: Build (Spark ${{ matrix.spark-version }} Scala ${{ matrix.scala-version }}) 9 | runs-on: ubuntu-latest 10 | 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | include: 15 | - spark-compat-version: '3.2' 16 | spark-version: '3.2.5-SNAPSHOT' 17 | scala-compat-version: '2.12' 18 | scala-version: '2.12.15' 19 | java-compat-version: '8' 20 | - spark-compat-version: '3.3' 21 | spark-version: '3.3.5-SNAPSHOT' 22 | scala-compat-version: '2.12' 23 | scala-version: '2.12.15' 24 | java-compat-version: '8' 25 | - spark-compat-version: '3.4' 26 | spark-version: '3.4.5-SNAPSHOT' 27 | scala-compat-version: '2.12' 28 | scala-version: '2.12.17' 29 | java-compat-version: '8' 30 | - spark-compat-version: '3.5' 31 | 
spark-version: '3.5.6-SNAPSHOT' 32 | scala-compat-version: '2.12' 33 | scala-version: '2.12.18' 34 | java-compat-version: '8' 35 | 36 | - spark-compat-version: '3.2' 37 | spark-version: '3.2.5-SNAPSHOT' 38 | scala-compat-version: '2.13' 39 | scala-version: '2.13.5' 40 | java-compat-version: '8' 41 | - spark-compat-version: '3.3' 42 | spark-version: '3.3.5-SNAPSHOT' 43 | scala-compat-version: '2.13' 44 | scala-version: '2.13.8' 45 | java-compat-version: '8' 46 | - spark-compat-version: '3.4' 47 | spark-version: '3.4.5-SNAPSHOT' 48 | scala-compat-version: '2.13' 49 | scala-version: '2.13.8' 50 | java-compat-version: '8' 51 | - spark-compat-version: '3.5' 52 | spark-version: '3.5.6-SNAPSHOT' 53 | scala-compat-version: '2.13' 54 | scala-version: '2.13.8' 55 | java-compat-version: '8' 56 | - spark-compat-version: '4.0' 57 | spark-version: '4.0.1-SNAPSHOT' 58 | scala-compat-version: '2.13' 59 | scala-version: '2.13.16' 60 | java-compat-version: '17' 61 | - spark-compat-version: '4.1' 62 | spark-version: '4.1.0-SNAPSHOT' 63 | scala-compat-version: '2.13' 64 | scala-version: '2.13.16' 65 | java-compat-version: '17' 66 | 67 | steps: 68 | - name: Checkout 69 | uses: actions/checkout@v4 70 | 71 | - name: Build 72 | uses: ./.github/actions/build 73 | with: 74 | spark-version: ${{ matrix.spark-version }} 75 | scala-version: ${{ matrix.scala-version }} 76 | spark-compat-version: ${{ matrix.spark-compat-version }}-SNAPSHOT 77 | scala-compat-version: ${{ matrix.scala-compat-version }} 78 | java-compat-version: ${{ matrix.java-compat-version }} 79 | -------------------------------------------------------------------------------- /.github/workflows/check.yml: -------------------------------------------------------------------------------- 1 | name: Check 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | lint: 8 | name: Scala lint 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v4 14 | with: 15 | fetch-depth: 0 16 | 17 | - name: Setup JDK ${{ inputs.java-compat-version }} 18 | uses: actions/setup-java@v4 19 | with: 20 | java-version: '11' 21 | distribution: 'zulu' 22 | 23 | - name: Check 24 | id: check 25 | run: | 26 | mvn --batch-mode --update-snapshots spotless:check 27 | shell: bash 28 | 29 | - name: Changes 30 | if: failure() && steps.check.outcome == 'failure' 31 | run: | 32 | mvn --batch-mode --update-snapshots spotless:apply 33 | git diff 34 | shell: bash 35 | 36 | config: 37 | name: Configure compat 38 | runs-on: ubuntu-latest 39 | outputs: 40 | major-version: ${{ steps.versions.outputs.major-version }} 41 | release-version: ${{ steps.versions.outputs.release-version }} 42 | release-major-version: ${{ steps.versions.outputs.release-major-version }} 43 | 44 | steps: 45 | - name: Checkout 46 | uses: actions/checkout@v4 47 | with: 48 | fetch-depth: 0 49 | 50 | - name: Get versions 51 | id: versions 52 | run: | 53 | version=$(grep -m1 version pom.xml | sed -e "s/<[^>]*>//g" -e "s/ //g") 54 | echo "version: $version" 55 | echo "major-version: ${version/.*/}" 56 | echo "version=$version" >> "$GITHUB_OUTPUT" 57 | echo "major-version=${version/.*/}" >> "$GITHUB_OUTPUT" 58 | release_version=$(git tag | grep "^v" | sort --version-sort | tail -n1 | sed "s/^v//") 59 | echo "release-version: $release_version" 60 | echo "release-major-version: ${release_version/.*/}" 61 | echo "release-version=$release_version" >> "$GITHUB_OUTPUT" 62 | echo "release-major-version=${release_version/.*/}" >> "$GITHUB_OUTPUT" 63 | shell: bash 64 | 65 | compat: 66 | name: Compat (Spark ${{ 
matrix.spark-compat-version }} Scala ${{ matrix.scala-compat-version }}) 67 | needs: config 68 | runs-on: ubuntu-latest 69 | if: needs.config.outputs.major-version == needs.config.outputs.release-major-version 70 | 71 | strategy: 72 | fail-fast: false 73 | matrix: 74 | include: 75 | - spark-compat-version: '3.0' 76 | spark-version: '3.0.3' 77 | scala-compat-version: '2.12' 78 | scala-version: '2.12.10' 79 | - spark-compat-version: '3.1' 80 | spark-version: '3.1.3' 81 | scala-compat-version: '2.12' 82 | scala-version: '2.12.10' 83 | - spark-compat-version: '3.2' 84 | spark-version: '3.2.4' 85 | scala-compat-version: '2.12' 86 | scala-version: '2.12.15' 87 | - spark-compat-version: '3.3' 88 | spark-version: '3.3.3' 89 | scala-compat-version: '2.12' 90 | scala-version: '2.12.15' 91 | - spark-compat-version: '3.4' 92 | scala-compat-version: '2.12' 93 | scala-version: '2.12.17' 94 | spark-version: '3.4.2' 95 | - spark-compat-version: '3.5' 96 | scala-compat-version: '2.12' 97 | scala-version: '2.12.18' 98 | spark-version: '3.5.0' 99 | 100 | steps: 101 | - name: Checkout 102 | uses: actions/checkout@v4 103 | 104 | - name: Check 105 | uses: ./.github/actions/check-compat 106 | with: 107 | spark-version: ${{ matrix.spark-version }} 108 | scala-version: ${{ matrix.scala-version }} 109 | spark-compat-version: ${{ matrix.spark-compat-version }} 110 | scala-compat-version: ${{ matrix.scala-compat-version }} 111 | package-version: ${{ needs.config.outputs.release-version }} 112 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | schedule: 5 | - cron: '0 8 */10 * *' 6 | push: 7 | tags: 8 | - '*' 9 | merge_group: 10 | pull_request: 11 | workflow_dispatch: 12 | 13 | jobs: 14 | event_file: 15 | name: "Event File" 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Upload 19 | uses: actions/upload-artifact@v4 20 | with: 21 | name: Event File 22 | path: ${{ github.event_path }} 23 | 24 | build-jvm: 25 | name: "Build JVM" 26 | uses: "./.github/workflows/build-jvm.yml" 27 | build-snapshots: 28 | name: "Build Snapshots" 29 | uses: "./.github/workflows/build-snapshots.yml" 30 | build-python: 31 | name: "Build Python" 32 | needs: build-jvm 33 | uses: "./.github/workflows/build-python.yml" 34 | 35 | test-jvm: 36 | name: "Test JVM" 37 | needs: build-jvm 38 | uses: "./.github/workflows/test-jvm.yml" 39 | test-python: 40 | name: "Test Python" 41 | needs: build-jvm 42 | uses: "./.github/workflows/test-python.yml" 43 | test-snapshots-jvm: 44 | name: "Test Snapshots" 45 | needs: build-snapshots 46 | uses: "./.github/workflows/test-snapshots.yml" 47 | 48 | check: 49 | name: "Check" 50 | needs: build-jvm 51 | uses: "./.github/workflows/check.yml" 52 | 53 | test_success: 54 | name: "Test success" 55 | if: always() 56 | runs-on: ubuntu-latest 57 | needs: [build-jvm, build-python, test-jvm, test-python] 58 | 59 | steps: 60 | - name: "Success" 61 | if: success() 62 | run: true 63 | shell: bash 64 | - name: "Failure" 65 | if: failure() 66 | run: false 67 | shell: bash 68 | -------------------------------------------------------------------------------- /.github/workflows/clear-caches.yaml: -------------------------------------------------------------------------------- 1 | name: Clear caches 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | permissions: 7 | actions: write 8 | 9 | jobs: 10 | clear-cache: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Clear caches 
14 | uses: actions/github-script@v7 15 | with: 16 | script: | 17 | const caches = await github.paginate( 18 | github.rest.actions.getActionsCacheList.endpoint.merge({ 19 | owner: context.repo.owner, 20 | repo: context.repo.repo, 21 | }) 22 | ) 23 | for (const cache of caches) { 24 | console.log(cache) 25 | github.rest.actions.deleteActionsCacheById({ 26 | owner: context.repo.owner, 27 | repo: context.repo.repo, 28 | cache_id: cache.id, 29 | }) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /.github/workflows/prime-caches.yml: -------------------------------------------------------------------------------- 1 | name: Prime caches 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | test: 8 | name: Spark ${{ matrix.spark-compat-version }}.${{ matrix.spark-patch-version }} Scala ${{ matrix.scala-version }} 9 | runs-on: ubuntu-latest 10 | 11 | strategy: 12 | fail-fast: false 13 | # keep in-sync with .github/workflows/test-jvm.yml 14 | matrix: 15 | scala-compat-version: ['2.12', '2.13'] 16 | spark-compat-version: ['3.4', '3.5'] 17 | spark-patch-version: ['0', '1', '2', '3', '4'] 18 | 19 | include: 20 | - spark-compat-version: '3.0' 21 | scala-compat-version: '2.12' 22 | scala-version: '2.12.10' 23 | spark-patch-version: '3' 24 | hadoop-version: '2.7' 25 | - spark-compat-version: '3.1' 26 | scala-compat-version: '2.12' 27 | scala-version: '2.12.10' 28 | spark-patch-version: '3' 29 | hadoop-version: '2.7' 30 | - spark-compat-version: '3.2' 31 | scala-compat-version: '2.12' 32 | scala-version: '2.12.15' 33 | spark-patch-version: '4' 34 | hadoop-version: '2.7' 35 | - spark-compat-version: '3.3' 36 | scala-compat-version: '2.12' 37 | scala-version: '2.12.15' 38 | spark-patch-version: '4' 39 | hadoop-version: '3' 40 | - spark-compat-version: '3.4' 41 | scala-compat-version: '2.12' 42 | scala-version: '2.12.17' 43 | hadoop-version: '3' 44 | - spark-compat-version: '3.5' 45 | scala-compat-version: '2.12' 46 | scala-version: '2.12.18' 47 | hadoop-version: '3' 48 | - spark-compat-version: '3.5' 49 | scala-compat-version: '2.12' 50 | scala-version: '2.12.18' 51 | spark-patch-version: '5' 52 | hadoop-version: '3' 53 | 54 | - spark-compat-version: '3.2' 55 | scala-compat-version: '2.13' 56 | scala-version: '2.13.5' 57 | spark-patch-version: '4' 58 | hadoop-version: '3.2' 59 | - spark-compat-version: '3.3' 60 | scala-compat-version: '2.13' 61 | scala-version: '2.13.8' 62 | spark-patch-version: '4' 63 | hadoop-version: '3' 64 | - spark-compat-version: '3.4' 65 | scala-compat-version: '2.13' 66 | scala-version: '2.13.8' 67 | hadoop-version: '3' 68 | - spark-compat-version: '3.5' 69 | scala-compat-version: '2.13' 70 | scala-version: '2.13.8' 71 | hadoop-version: '3' 72 | - spark-compat-version: '3.5' 73 | scala-compat-version: '2.13' 74 | scala-version: '2.13.8' 75 | spark-patch-version: '5' 76 | hadoop-version: '3' 77 | 78 | steps: 79 | - name: Checkout 80 | uses: actions/checkout@v4 81 | 82 | - name: Prime caches 83 | uses: ./.github/actions/prime-caches 84 | with: 85 | spark-version: ${{ matrix.spark-compat-version }}.${{ matrix.spark-patch-version }} 86 | scala-version: ${{ matrix.scala-version }} 87 | spark-compat-version: ${{ matrix.spark-compat-version }} 88 | scala-compat-version: ${{ matrix.scala-compat-version }} 89 | hadoop-version: ${{ matrix.hadoop-version }} 90 | java-compat-version: '8' 91 | -------------------------------------------------------------------------------- /.github/workflows/publish-release.yml: 
-------------------------------------------------------------------------------- 1 | name: Publish release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | versions: 7 | required: true 8 | type: string 9 | description: 'Example: {"include": [{"params": {"spark-version": "3.0.3","scala-version": "2.12.10"}}]}' 10 | default: | 11 | { 12 | "include": [ 13 | {"params": {"spark-version": "3.0.3","scala-version": "2.12.10"}}, 14 | {"params": {"spark-version": "3.1.3","scala-version": "2.12.10"}}, 15 | {"params": {"spark-version": "3.2.4","scala-version": "2.12.15"}}, 16 | {"params": {"spark-version": "3.3.4","scala-version": "2.12.15"}}, 17 | {"params": {"spark-version": "3.4.4","scala-version": "2.12.17"}}, 18 | {"params": {"spark-version": "3.5.5","scala-version": "2.12.18"}}, 19 | {"params": {"spark-version": "3.2.4","scala-version": "2.13.5"}}, 20 | {"params": {"spark-version": "3.3.4","scala-version": "2.13.8"}}, 21 | {"params": {"spark-version": "3.4.4","scala-version": "2.13.8"}}, 22 | {"params": {"spark-version": "3.5.5","scala-version": "2.13.8"}} 23 | ] 24 | } 25 | 26 | env: 27 | # PySpark 3 versions only work with Python 3.9 28 | PYTHON_VERSION: "3.9" 29 | 30 | jobs: 31 | maven-release: 32 | name: Publish maven release 33 | runs-on: ubuntu-latest 34 | if: ( ! github.event.repository.fork ) 35 | # secrets are provided by environment 36 | environment: 37 | name: release 38 | # a different URL for each point in the matrix, but the same URLs across commits 39 | url: 'https://github.com/G-Research/spark-extension?spark=${{ matrix.params.spark-version }}&scala=${{ matrix.params.scala-version }}' 40 | permissions: 41 | id-token: write # required for PyPI publish 42 | strategy: 43 | fail-fast: false 44 | matrix: ${{ fromJson(github.event.inputs.versions) }} 45 | 46 | steps: 47 | - name: Checkout release tag 48 | uses: actions/checkout@v4 49 | 50 | - name: Get versions 51 | id: versions 52 | run: | 53 | # get release version 54 | version=$(grep --max-count=1 "<version>.*</version>" pom.xml | sed -E -e "s/\s*<[^>]+>//g" -e "s/-SNAPSHOT//" -e "s/-[0-9.]+//g") 55 | is_snapshot=$(if grep -q "<version>.*-SNAPSHOT</version>" pom.xml; then echo "true"; else echo "false"; fi) 56 | 57 | # share versions 58 | echo "release-tag=v${version}" >> "$GITHUB_OUTPUT" 59 | echo "is-snapshot=$is_snapshot" >> "$GITHUB_OUTPUT" 60 | 61 | - name: Check tag setup 62 | run: | 63 | # Check tag setup 64 | if [[ "$GITHUB_REF" != "refs/tags/v"* ]] 65 | then 66 | echo "This workflow must be run on a tag, not $GITHUB_REF" 67 | exit 1 68 | fi 69 | 70 | if [ "${{ steps.versions.outputs.is-snapshot }}" == "true" ] 71 | then 72 | echo "This is a tagged SNAPSHOT version. This is not allowed for release!" 73 | exit 1 74 | fi 75 | 76 | if [ "${{ github.ref_name }}" != "${{ steps.versions.outputs.release-tag }}" ] 77 | then 78 | echo "The version in the pom.xml is ${{ steps.versions.outputs.release-tag }}" 79 | echo "This tag is ${{ github.ref_name }}, which is different!" 
80 | exit 1 81 | fi 82 | 83 | - name: Set up JDK and publish to Maven Central 84 | uses: actions/setup-java@3a4f6e1af504cf6a31855fa899c6aa5355ba6c12 # v4.7.0 85 | with: 86 | java-version: '8' 87 | distribution: 'corretto' 88 | server-id: ossrh 89 | server-username: MAVEN_USERNAME 90 | server-password: MAVEN_PASSWORD 91 | gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }} 92 | gpg-passphrase: MAVEN_GPG_PASSPHRASE 93 | 94 | - name: Inspect GPG 95 | run: gpg -k 96 | 97 | - uses: actions/setup-python@v5 98 | with: 99 | python-version: ${{ env.PYTHON_VERSION }} 100 | 101 | - name: Restore Maven packages cache 102 | id: cache-maven 103 | uses: actions/cache/restore@v4 104 | with: 105 | path: ~/.m2/repository 106 | key: ${{ runner.os }}-mvn-build-${{ matrix.params.spark-version }}-${{ matrix.params.scala-version }}-${{ hashFiles('pom.xml') }} 107 | restore-keys: ${{ runner.os }}-mvn-build-${{ matrix.params.spark-version }}-${{ matrix.params.scala-version }}- 108 | 109 | - name: Publish maven artifacts 110 | id: publish-maven 111 | run: | 112 | ./set-version.sh ${{ matrix.params.spark-version }} ${{ matrix.params.scala-version }} 113 | mvn clean deploy -Dsign -Dspotless.check.skip -DskipTests -Dmaven.test.skip=true 114 | env: 115 | MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }} 116 | MAVEN_PASSWORD: ${{ secrets.OSSRH_PASSWORD }} 117 | MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }} 118 | 119 | - name: Prepare PyPi package 120 | id: prepare-pypi-package 121 | if: startsWith(matrix.params.scala-version, '2.12') 122 | run: | 123 | ./build-whl.sh 124 | 125 | - name: Publish package distributions to PyPI 126 | uses: pypa/gh-action-pypi-publish@release/v1 127 | if: startsWith(matrix.params.scala-version, '2.12') 128 | with: 129 | user: ${{ secrets.PYPI_USERNAME }} 130 | password: ${{ secrets.PYPI_PASSWORD }} 131 | packages-dir: python/dist 132 | verbose: true 133 | -------------------------------------------------------------------------------- /.github/workflows/publish-snapshot.yml: -------------------------------------------------------------------------------- 1 | name: Publish snapshot 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: ["master"] 7 | 8 | env: 9 | PYTHON_VERSION: "3.10" 10 | 11 | jobs: 12 | check-version: 13 | name: Check SNAPSHOT version 14 | if: ( ! 
github.event.repository.fork ) 15 | runs-on: ubuntu-latest 16 | permissions: {} 17 | outputs: 18 | is-snapshot: ${{ steps.check.outputs.is-snapshot }} 19 | 20 | steps: 21 | - name: Checkout code 22 | uses: actions/checkout@v4 23 | 24 | - name: Check if this is a SNAPSHOT version 25 | id: check 26 | run: | 27 | # check is snapshot version 28 | if grep -q "<version>.*-SNAPSHOT</version>" pom.xml 29 | then 30 | echo "Version in pom IS a SNAPSHOT version" 31 | echo "is-snapshot=true" >> "$GITHUB_OUTPUT" 32 | else 33 | echo "Version in pom is NOT a SNAPSHOT version" 34 | echo "is-snapshot=false" >> "$GITHUB_OUTPUT" 35 | fi 36 | 37 | snapshot: 38 | name: Snapshot Spark ${{ matrix.params.spark-version }} Scala ${{ matrix.params.scala-version }} 39 | needs: check-version 40 | # when we release from master, this workflow will see a commit that does not have a SNAPSHOT version 41 | # we want this workflow to skip over that commit 42 | if: needs.check-version.outputs.is-snapshot == 'true' 43 | runs-on: ubuntu-latest 44 | # secrets are provided by environment 45 | environment: 46 | name: snapshot 47 | # a different URL for each point in the matrix, but the same URLs across commits 48 | url: 'https://github.com/G-Research/spark-extension?spark=${{ matrix.params.spark-version }}&scala=${{ matrix.params.scala-version }}&snapshot' 49 | permissions: {} 50 | strategy: 51 | fail-fast: false 52 | matrix: 53 | include: 54 | - params: {"spark-version": "3.0.3", "scala-version": "2.12.10", "scala-compat-version": "2.12"} 55 | - params: {"spark-version": "3.1.3", "scala-version": "2.12.10", "scala-compat-version": "2.12"} 56 | - params: {"spark-version": "3.2.4", "scala-version": "2.12.15", "scala-compat-version": "2.12"} 57 | - params: {"spark-version": "3.3.4", "scala-version": "2.12.15", "scala-compat-version": "2.12"} 58 | - params: {"spark-version": "3.4.4", "scala-version": "2.12.17", "scala-compat-version": "2.12"} 59 | - params: {"spark-version": "3.5.5", "scala-version": "2.12.18", "scala-compat-version": "2.12"} 60 | - params: {"spark-version": "3.2.4", "scala-version": "2.13.5", "scala-compat-version": "2.13"} 61 | - params: {"spark-version": "3.3.4", "scala-version": "2.13.8", "scala-compat-version": "2.13"} 62 | - params: {"spark-version": "3.4.4", "scala-version": "2.13.8", "scala-compat-version": "2.13"} 63 | - params: {"spark-version": "3.5.5", "scala-version": "2.13.8", "scala-compat-version": "2.13"} 64 | 65 | steps: 66 | - name: Checkout code 67 | uses: actions/checkout@v4 68 | 69 | - name: Set up JDK and publish to Maven Central 70 | uses: actions/setup-java@3a4f6e1af504cf6a31855fa899c6aa5355ba6c12 # v4.7.0 71 | with: 72 | java-version: '8' 73 | distribution: 'corretto' 74 | server-id: ossrh 75 | server-username: MAVEN_USERNAME 76 | server-password: MAVEN_PASSWORD 77 | gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }} 78 | gpg-passphrase: MAVEN_GPG_PASSPHRASE 79 | 80 | - name: Inspect GPG 81 | run: gpg -k 82 | 83 | - uses: actions/setup-python@v5 84 | with: 85 | python-version: ${{ env.PYTHON_VERSION }} 86 | 87 | - name: Restore Maven packages cache 88 | id: cache-maven 89 | uses: actions/cache/restore@v4 90 | with: 91 | path: ~/.m2/repository 92 | key: ${{ runner.os }}-mvn-build-${{ matrix.params.spark-version }}-${{ matrix.params.scala-version }}-${{ hashFiles('pom.xml') }} 93 | restore-keys: ${{ runner.os }}-mvn-build-${{ matrix.params.spark-version }}-${{ matrix.params.scala-version }}- 94 | 95 | - name: Publish snapshot 96 | run: | 97 | ./set-version.sh ${{ matrix.params.spark-version }} ${{ 
matrix.params.scala-version }} 98 | mvn clean deploy -Dsign -Dspotless.check.skip -DskipTests -Dmaven.test.skip=true 99 | env: 100 | MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }} 101 | MAVEN_PASSWORD: ${{ secrets.OSSRH_PASSWORD }} 102 | MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }} 103 | 104 | - name: Prepare PyPi package to test snapshot 105 | if: startsWith(matrix.params.scala-version, '2.12') 106 | run: | 107 | # Build whl 108 | ./build-whl.sh 109 | 110 | - name: Restore Spark Binaries cache 111 | uses: actions/cache/restore@v4 112 | with: 113 | path: ~/spark 114 | key: ${{ runner.os }}-spark-binaries-${{ matrix.params.spark-version }}-${{ matrix.params.scala-compat-version }} 115 | restore-keys: 116 | ${{ runner.os }}-spark-binaries-${{ matrix.params.spark-version }}-${{ matrix.params.scala-compat-version }} 117 | 118 | - name: Rename Spark Binaries cache 119 | run: | 120 | mv ~/spark ./spark-${{ matrix.params.spark-version }}-${{ matrix.params.scala-compat-version }} 121 | 122 | - name: Test snapshot 123 | id: test-package 124 | run: | 125 | # Test the snapshot (needs whl) 126 | ./test-release.sh 127 | -------------------------------------------------------------------------------- /.github/workflows/test-jvm.yml: -------------------------------------------------------------------------------- 1 | name: Test JVM 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | test: 8 | name: Test (Spark ${{ matrix.spark-compat-version }}.${{ matrix.spark-patch-version }} Scala ${{ matrix.scala-version }}) 9 | runs-on: ubuntu-latest 10 | 11 | strategy: 12 | fail-fast: false 13 | # keep in-sync with .github/workflows/prime-caches.yml 14 | matrix: 15 | scala-compat-version: ['2.12', '2.13'] 16 | spark-compat-version: ['3.4', '3.5'] 17 | spark-patch-version: ['0', '1', '2', '3', '4'] 18 | 19 | include: 20 | - spark-compat-version: '3.0' 21 | scala-compat-version: '2.12' 22 | scala-version: '2.12.10' 23 | spark-patch-version: '3' 24 | hadoop-version: '2.7' 25 | - spark-compat-version: '3.1' 26 | scala-compat-version: '2.12' 27 | scala-version: '2.12.10' 28 | spark-patch-version: '3' 29 | hadoop-version: '2.7' 30 | - spark-compat-version: '3.2' 31 | scala-compat-version: '2.12' 32 | scala-version: '2.12.15' 33 | spark-patch-version: '4' 34 | hadoop-version: '2.7' 35 | - spark-compat-version: '3.3' 36 | scala-compat-version: '2.12' 37 | scala-version: '2.12.15' 38 | spark-patch-version: '4' 39 | hadoop-version: '3' 40 | - spark-compat-version: '3.4' 41 | scala-compat-version: '2.12' 42 | scala-version: '2.12.17' 43 | hadoop-version: '3' 44 | - spark-compat-version: '3.5' 45 | scala-compat-version: '2.12' 46 | scala-version: '2.12.18' 47 | hadoop-version: '3' 48 | - spark-compat-version: '3.5' 49 | scala-compat-version: '2.12' 50 | scala-version: '2.12.18' 51 | spark-patch-version: '5' 52 | hadoop-version: '3' 53 | 54 | - spark-compat-version: '3.2' 55 | scala-compat-version: '2.13' 56 | scala-version: '2.13.5' 57 | spark-patch-version: '4' 58 | hadoop-version: '3.2' 59 | - spark-compat-version: '3.3' 60 | scala-compat-version: '2.13' 61 | scala-version: '2.13.8' 62 | spark-patch-version: '4' 63 | hadoop-version: '3' 64 | - spark-compat-version: '3.4' 65 | scala-compat-version: '2.13' 66 | scala-version: '2.13.8' 67 | hadoop-version: '3' 68 | - spark-compat-version: '3.5' 69 | scala-compat-version: '2.13' 70 | scala-version: '2.13.8' 71 | hadoop-version: '3' 72 | - spark-compat-version: '3.5' 73 | scala-compat-version: '2.13' 74 | scala-version: '2.13.8' 75 | spark-patch-version: '5' 76 | hadoop-version: 
'3' 77 | 78 | steps: 79 | - name: Checkout 80 | uses: actions/checkout@v4 81 | 82 | - name: Test 83 | uses: ./.github/actions/test-jvm 84 | env: 85 | CI_SLOW_TESTS: 1 86 | with: 87 | spark-version: ${{ matrix.spark-compat-version }}.${{ matrix.spark-patch-version }} 88 | scala-version: ${{ matrix.scala-version }} 89 | spark-compat-version: ${{ matrix.spark-compat-version }} 90 | scala-compat-version: ${{ matrix.scala-compat-version }} 91 | hadoop-version: ${{ matrix.hadoop-version }} 92 | java-compat-version: '8' 93 | -------------------------------------------------------------------------------- /.github/workflows/test-python.yml: -------------------------------------------------------------------------------- 1 | name: Test Python 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | # pyspark is not available for snapshots or scala other than 2.12 8 | # we would have to compile spark from sources for this, not worth it 9 | test: 10 | name: Test (Spark ${{ matrix.spark-version }} Scala ${{ matrix.scala-version }} Python ${{ matrix.python-version }}) 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | spark-compat-version: ['3.2', '3.3', '3.4', '3.5'] 17 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 18 | 19 | include: 20 | - spark-compat-version: '3.0' 21 | spark-version: '3.0.3' 22 | hadoop-version: '2.7' 23 | scala-compat-version: '2.12' 24 | scala-version: '2.12.10' 25 | python-version: '3.8' 26 | - spark-compat-version: '3.1' 27 | spark-version: '3.1.3' 28 | hadoop-version: '2.7' 29 | scala-compat-version: '2.12' 30 | scala-version: '2.12.10' 31 | python-version: '3.8' 32 | - spark-compat-version: '3.2' 33 | spark-version: '3.2.4' 34 | hadoop-version: '2.7' 35 | scala-compat-version: '2.12' 36 | scala-version: '2.12.15' 37 | - spark-compat-version: '3.3' 38 | spark-version: '3.3.4' 39 | hadoop-version: '3' 40 | scala-compat-version: '2.12' 41 | scala-version: '2.12.15' 42 | - spark-compat-version: '3.4' 43 | spark-version: '3.4.4' 44 | hadoop-version: '3' 45 | scala-compat-version: '2.12' 46 | scala-version: '2.12.17' 47 | - spark-compat-version: '3.5' 48 | spark-version: '3.5.5' 49 | hadoop-version: '3' 50 | scala-compat-version: '2.12' 51 | scala-version: '2.12.18' 52 | 53 | exclude: 54 | - spark-compat-version: '3.2' 55 | python-version: '3.10' 56 | - spark-compat-version: '3.2' 57 | python-version: '3.11' 58 | - spark-compat-version: '3.2' 59 | python-version: '3.12' 60 | - spark-compat-version: '3.2' 61 | python-version: '3.13' 62 | 63 | - spark-compat-version: '3.3' 64 | python-version: '3.11' 65 | - spark-compat-version: '3.3' 66 | python-version: '3.12' 67 | - spark-compat-version: '3.3' 68 | python-version: '3.13' 69 | 70 | - spark-compat-version: '3.4' 71 | python-version: '3.12' 72 | - spark-compat-version: '3.4' 73 | python-version: '3.13' 74 | 75 | - spark-compat-version: '3.5' 76 | python-version: '3.12' 77 | - spark-compat-version: '3.5' 78 | python-version: '3.13' 79 | 80 | steps: 81 | - name: Checkout 82 | uses: actions/checkout@v4 83 | 84 | - name: Test 85 | uses: ./.github/actions/test-python 86 | with: 87 | spark-version: ${{ matrix.spark-version }} 88 | scala-version: ${{ matrix.scala-version }} 89 | spark-compat-version: ${{ matrix.spark-compat-version }} 90 | scala-compat-version: ${{ matrix.scala-compat-version }} 91 | hadoop-version: ${{ matrix.hadoop-version }} 92 | python-version: ${{ matrix.python-version }} 93 | -------------------------------------------------------------------------------- 
/.github/workflows/test-results.yml: -------------------------------------------------------------------------------- 1 | name: Test Results 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["CI"] 6 | types: 7 | - completed 8 | permissions: {} 9 | 10 | jobs: 11 | publish-test-results: 12 | name: Publish Test Results 13 | runs-on: ubuntu-latest 14 | if: github.event.workflow_run.conclusion != 'skipped' 15 | permissions: 16 | checks: write 17 | pull-requests: write 18 | 19 | steps: 20 | - name: Download and Extract Artifacts 21 | uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe 22 | with: 23 | run_id: ${{ github.event.workflow_run.id }} 24 | name: "^Event File$| Test Results " 25 | name_is_regexp: true 26 | path: artifacts 27 | 28 | - name: Publish Test Results 29 | uses: EnricoMi/publish-unit-test-result-action@v2 30 | with: 31 | commit: ${{ github.event.workflow_run.head_sha }} 32 | event_file: artifacts/Event File/event.json 33 | event_name: ${{ github.event.workflow_run.event }} 34 | junit_files: "artifacts/* Test Results*/**/*.xml" 35 | -------------------------------------------------------------------------------- /.github/workflows/test-snapshots.yml: -------------------------------------------------------------------------------- 1 | name: Test Snapshots 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | test: 8 | name: Test (Spark ${{ matrix.spark-version }} Scala ${{ matrix.scala-version }}) 9 | runs-on: ubuntu-latest 10 | 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | include: 15 | - spark-compat-version: '3.2' 16 | spark-version: '3.2.5-SNAPSHOT' 17 | scala-compat-version: '2.12' 18 | scala-version: '2.12.15' 19 | java-compat-version: '8' 20 | - spark-compat-version: '3.3' 21 | spark-version: '3.3.5-SNAPSHOT' 22 | scala-compat-version: '2.12' 23 | scala-version: '2.12.15' 24 | java-compat-version: '8' 25 | - spark-compat-version: '3.4' 26 | spark-version: '3.4.5-SNAPSHOT' 27 | scala-compat-version: '2.12' 28 | scala-version: '2.12.17' 29 | java-compat-version: '8' 30 | - spark-compat-version: '3.5' 31 | spark-version: '3.5.6-SNAPSHOT' 32 | scala-compat-version: '2.12' 33 | scala-version: '2.12.17' 34 | java-compat-version: '8' 35 | 36 | - spark-compat-version: '3.2' 37 | spark-version: '3.2.5-SNAPSHOT' 38 | scala-compat-version: '2.13' 39 | scala-version: '2.13.5' 40 | java-compat-version: '8' 41 | - spark-compat-version: '3.3' 42 | spark-version: '3.3.5-SNAPSHOT' 43 | scala-compat-version: '2.13' 44 | scala-version: '2.13.8' 45 | java-compat-version: '8' 46 | - spark-compat-version: '3.4' 47 | spark-version: '3.4.5-SNAPSHOT' 48 | scala-compat-version: '2.13' 49 | scala-version: '2.13.8' 50 | java-compat-version: '8' 51 | - spark-compat-version: '3.5' 52 | spark-version: '3.5.6-SNAPSHOT' 53 | scala-compat-version: '2.13' 54 | scala-version: '2.13.8' 55 | java-compat-version: '8' 56 | - spark-compat-version: '4.0' 57 | spark-version: '4.0.1-SNAPSHOT' 58 | scala-compat-version: '2.13' 59 | scala-version: '2.13.16' 60 | java-compat-version: '17' 61 | - spark-compat-version: '4.1' 62 | spark-version: '4.1.0-SNAPSHOT' 63 | scala-compat-version: '2.13' 64 | scala-version: '2.13.16' 65 | java-compat-version: '17' 66 | 67 | steps: 68 | - name: Checkout 69 | uses: actions/checkout@v4 70 | 71 | - name: Test 72 | uses: ./.github/actions/test-jvm 73 | env: 74 | CI_SLOW_TESTS: 1 75 | with: 76 | spark-version: ${{ matrix.spark-version }} 77 | scala-version: ${{ matrix.scala-version }} 78 | spark-compat-version: ${{ matrix.spark-compat-version }}-SNAPSHOT 79 | 
scala-compat-version: ${{ matrix.scala-compat-version }} 80 | java-compat-version: ${{ matrix.java-compat-version }} 81 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # use glob syntax. 2 | syntax: glob 3 | *.ser 4 | *.class 5 | *~ 6 | *.bak 7 | #*.off 8 | *.old 9 | 10 | # eclipse conf file 11 | .settings 12 | .classpath 13 | .project 14 | .manager 15 | .scala_dependencies 16 | 17 | # idea 18 | .idea 19 | *.iml 20 | 21 | # building 22 | target 23 | build 24 | null 25 | tmp* 26 | temp* 27 | dist 28 | test-output 29 | build.log 30 | 31 | # other scm 32 | .svn 33 | .CVS 34 | .hg* 35 | 36 | # switch to regexp syntax. 37 | # syntax: regexp 38 | # ^\.pc/ 39 | 40 | #SHITTY output not in target directory 41 | build.log 42 | 43 | # project specific 44 | python/**/__pycache__ 45 | python/requirements.txt 46 | spark-* 47 | .cache -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.7.17 2 | runner.dialect = scala213 3 | rewrite.trailingCommas.style = keep 4 | docstrings.style = Asterisk 5 | maxColumn = 120 6 | 7 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 5 | 6 | ## [2.13.0] - 2024-11-04 7 | 8 | ### Fixes 9 | - Support diff for Spark Connect implemened via PySpark Dataset API (#251) 10 | 11 | ### Added 12 | - Add ignore columns to diff in Python API (#252) 13 | - Check that the Java / Scala package is installed when needed by Python (#250) 14 | 15 | ## [2.12.0] - 2024-04-26 16 | 17 | ## Fixes 18 | 19 | - Diff change column should respect comparators (#238) 20 | 21 | ## Changed 22 | 23 | - Make create_temporary_dir work with pyspark-extension only (#222). 24 | This allows [installing PIP packages and Poetry projects](PYSPARK-DEPS.md) 25 | via pure Python spark-extension package (Maven package not required any more). 26 | - Add map diff comparator to Python API (#226) 27 | 28 | ## [2.11.0] - 2024-01-04 29 | 30 | ### Added 31 | 32 | - Add count_null aggregate function (#206) 33 | - Support reading parquet schema (#208) 34 | - Add more columns to reading parquet metadata (#209, #211) 35 | - Provide groupByKey shortcuts for groupBy.as (#213) 36 | - Allow to install PIP packages into PySpark job (#215) 37 | - Allow to install Poetry projects into PySpark job (#216) 38 | 39 | ## [2.10.0] - 2023-09-27 40 | 41 | ### Fixed 42 | 43 | - Update setup.py to include parquet methods in python package (#191) 44 | 45 | ### Added 46 | 47 | - Add --statistics option to diff app (#189) 48 | - Add --filter option to diff app (#190) 49 | 50 | ## [2.9.0] - 2023-08-23 51 | 52 | ### Added 53 | 54 | - Add key order sensitive map comparator (#187) 55 | 56 | ### Changed 57 | 58 | - Use dataset encoder rather than implicit value encoder for implicit dataset extension class (#183) 59 | 60 | ### Fixed 61 | 62 | - Fix key-sensitivity in map comparator (#186) 63 | 64 | ## [2.8.0] - 2023-05-24 65 | 66 | ### Added 67 | 68 | - Add method to set and automatically unset Spark job description. 
(#172) 69 | - Add column function that converts between .Net (C#, F#, Visual Basic) `DateTime.Ticks` and Spark timestamp / Unix epoch timestamps. (#153) 70 | 71 | ## [2.7.0] - 2023-05-05 72 | 73 | ### Added 74 | 75 | - Spark app to diff files or tables and write result back to file or table. (#160) 76 | - Add null value count to `parquetBlockColumns` and `parquet_block_columns`. (#162) 77 | - Add `parallelism` argument to Parquet metadata methods. (#164) 78 | 79 | ### Changed 80 | 81 | - Change data type of column name in `parquetBlockColumns` and `parquet_block_columns` to array of strings. 82 | Cast to string to get earlier behaviour (string column name). (#162) 83 | 84 | ## [2.6.0] - 2023-04-11 85 | 86 | ### Added 87 | 88 | - Add reader for parquet metadata. (#154) 89 | 90 | ## [2.5.0] - 2023-03-23 91 | 92 | ### Added 93 | 94 | - Add whitespace agnostic diff comparator. (#137) 95 | - Add Python whl package build. (#151) 96 | 97 | ## [2.4.0] - 2022-12-08 98 | 99 | ### Added 100 | 101 | - Allow for custom diff equality. (#127) 102 | 103 | ### Fixed 104 | 105 | - Fix Python API calling into Scala code. (#132) 106 | 107 | ## [2.3.0] - 2022-10-26 108 | 109 | ### Added 110 | 111 | - Add diffWith to Scala, Java and Python Diff API. (#109) 112 | 113 | ### Changed 114 | 115 | - Diff similar Datasets with ignoreColumns. Before, only similar DataFrames could be diffed with ignoreColumns. (#111) 116 | 117 | ### Fixed 118 | 119 | - Cache before writing via partitionedBy to work around SPARK-40588. Unpersist via UnpersistHandle. (#124) 120 | 121 | ## [2.2.0] - 2022-07-21 122 | 123 | ### Added 124 | - Add (global) row numbers transformation to Scala, Java and Python API. (#97) 125 | 126 | ### Removed 127 | - Removed support for Python 3.6. 128 | 129 | ## [2.1.0] - 2022-04-07 130 | 131 | ### Added 132 | - Add sorted group methods to Dataset. (#76) 133 | 134 | ## [2.0.0] - 2021-10-29 135 | 136 | ### Added 137 | - Add support for Spark 3.2 and Scala 2.13. 138 | - Support to ignore columns in diff API. (#63) 139 | 140 | ### Removed 141 | - Removed support for Spark 2.4. 142 | 143 | ## [1.3.3] - 2020-12-17 144 | 145 | ### Added 146 | - Add support for Spark 3.1. 147 | 148 | ## [1.3.2] - 2020-12-17 149 | 150 | ### Changed 151 | - Refine conditional transformation helper methods. 152 | 153 | ## [1.3.1] - 2020-12-10 154 | 155 | ### Changed 156 | - Refine conditional transformation helper methods. 157 | 158 | ## [1.3.0] - 2020-12-07 159 | 160 | ### Added 161 | - Add transformation to compute histogram. (#26) 162 | - Add conditional transformation helper methods. (#27) 163 | - Add partitioned writing helpers that simplify writing optimally ordered partitioned data. (#29) 164 | 165 | ## [1.2.0] - 2020-10-06 166 | 167 | ### Added 168 | - Add diff modes (#22): column-by-column, side-by-side, left and right side diff modes. 169 | - Add sparse mode (#23): diff DataFrame contains only changed values. 170 | 171 | ## [1.1.0] - 2020-08-24 172 | 173 | ### Added 174 | - Add Python API for Diff transformation. 175 | - Add change column to Diff transformation providing column names of all changed columns in a row. 176 | - Add fluent methods to change immutable diff options. 177 | - Add `backticks` method to handle column names that contain dots (`.`). 178 | 179 | ## [1.0.0] - 2020-03-12 180 | 181 | ### Added 182 | - Add Diff transformation for Datasets.
183 | -------------------------------------------------------------------------------- /CONDITIONAL.md: -------------------------------------------------------------------------------- 1 | # DataFrame Transformations 2 | 3 | The Spark `Dataset` API allows for chaining transformations as in the following example: 4 | 5 | ```scala 6 | ds.where($"id" === 1) 7 | .withColumn("state", lit("new")) 8 | .orderBy($"timestamp") 9 | ``` 10 | 11 | When you define additional transformation functions, the `Dataset` API allows you to 12 | also fluently call into those: 13 | 14 | ```scala 15 | def transformation(df: DataFrame): DataFrame = df.distinct 16 | 17 | ds.transform(transformation) 18 | ``` 19 | 20 | Here are some methods that extend this principle to conditional calls. 21 | 22 | ## Conditional Transformations 23 | 24 | You can run a transformation after checking a condition with a chain of fluent transformation calls: 25 | 26 | ```scala 27 | import uk.co.gresearch._ 28 | 29 | val condition = true 30 | 31 | val result = 32 | ds.where($"id" === 1) 33 | .withColumn("state", lit("new")) 34 | .when(condition).call(transformation) 35 | .orderBy($"timestamp") 36 | ``` 37 | 38 | rather than 39 | 40 | ```scala 41 | val condition = true 42 | 43 | val filteredDf = ds.where($"id" === 1) 44 | .withColumn("state", lit("new")) 45 | val condDf = if (condition) ds.call(transformation) else ds 46 | val result = ds.orderBy($"timestamp") 47 | ``` 48 | 49 | In case you need an else transformation as well, try: 50 | 51 | ```scala 52 | import uk.co.gresearch._ 53 | 54 | val condition = true 55 | 56 | val result = 57 | ds.where($"id" === 1) 58 | .withColumn("state", lit("new")) 59 | .on(condition).either(transformation).or(other) 60 | .orderBy($"timestamp") 61 | ``` 62 | 63 | ## Fluent and conditional functions elsewhere 64 | 65 | The same fluent notation works for instances other than `Dataset` or `DataFrame`, e.g. 66 | for the `DataFrameWriter`: 67 | 68 | ```scala 69 | def writeData[T](writer: DataFrameWriter[T]): Unit = { ... } 70 | 71 | ds.write 72 | .when(compress).call(_.option("compression", "gzip")) 73 | .call(writeData) 74 | ``` 75 | -------------------------------------------------------------------------------- /GROUPS.md: -------------------------------------------------------------------------------- 1 | # Sorted Groups 2 | 3 | Spark provides the ability to group rows by an arbitrary key, 4 | while then providing an iterator for each of these groups. 5 | This allows to iterate over groups that are too large to fit into memory: 6 | 7 | ```scala 8 | import org.apache.spark.sql.Dataset 9 | 10 | import spark.implicits._ 11 | 12 | case class Val(id: Int, seq: Int, value: Double) 13 | 14 | val ds: Dataset[Val] = Seq( 15 | Val(1, 1, 1.1), 16 | Val(1, 2, 1.2), 17 | Val(1, 3, 1.3), 18 | 19 | Val(2, 1, 2.1), 20 | Val(2, 2, 2.2), 21 | Val(2, 3, 2.3), 22 | 23 | Val(3, 1, 3.1) 24 | ).reverse.toDS().repartition(3).cache() 25 | 26 | // order of iterator IS NOT guaranteed 27 | ds.groupByKey(v => v.id) 28 | .flatMapGroups((key, it) => it.zipWithIndex.map(v => (key, v._2, v._1.seq, v._1.value))) 29 | .toDF("key", "index", "seq", "value") 30 | .show(false) 31 | 32 | +---+-----+---+-----+ 33 | |key|index|seq|value| 34 | +---+-----+---+-----+ 35 | |1 |0 |3 |1.3 | 36 | |1 |1 |2 |1.2 | 37 | |1 |2 |1 |1.1 | 38 | |2 |0 |1 |2.1 | 39 | |2 |1 |3 |2.3 | 40 | |2 |2 |2 |2.2 | 41 | |3 |0 |1 |3.1 | 42 | +---+-----+---+-----+ 43 | ``` 44 | 45 | However, we have no control over the order of the group iterators. 
46 | If we want the iterators to be ordered according to `seq`, we can do the following: 47 | 48 | ```scala 49 | import uk.co.gresearch.spark._ 50 | 51 | // the group key $"id" needs an ordering 52 | implicit val ordering: Ordering.Int.type = Ordering.Int 53 | 54 | // order of iterator IS guaranteed 55 | ds.groupBySorted($"id")($"seq") 56 | .flatMapSortedGroups((key, it) => it.zipWithIndex.map(v => (key, v._2, v._1.seq, v._1.value))) 57 | .toDF("key", "index", "seq", "value") 58 | .show(false) 59 | 60 | +---+-----+---+-----+ 61 | |key|index|seq|value| 62 | +---+-----+---+-----+ 63 | |1 |0 |1 |1.1 | 64 | |1 |1 |2 |1.2 | 65 | |1 |2 |3 |1.3 | 66 | |2 |0 |1 |2.1 | 67 | |2 |1 |2 |2.2 | 68 | |2 |2 |3 |2.3 | 69 | |3 |0 |1 |3.1 | 70 | +---+-----+---+-----+ 71 | ``` 72 | 73 | Now the iterators are ordered according to `seq`, as shown by the value of `index`, 74 | which was generated by `it.zipWithIndex`. 75 | 76 | Instead of column expressions, we can also use lambdas to define the group key and group order: 77 | ```scala 78 | ds.groupByKeySorted(v => v.id)(v => v.seq) 79 | .flatMapSortedGroups((key, it) => it.zipWithIndex.map(v => (key, v._2, v._1.seq, v._1.value))) 80 | .toDF("key", "index", "seq", "value") 81 | .show(false) 82 | ``` 83 | 84 | **Note:** Using lambdas here hides from Spark which columns we use for grouping and sorting. 85 | Query optimization cannot improve partitioning and sorting in this case. Use column expressions when possible. 86 | -------------------------------------------------------------------------------- /HISTOGRAM.md: -------------------------------------------------------------------------------- 1 | # Histogram 2 | 3 | For a table `df` like 4 | 5 | |user |score| 6 | |:-----:|:---:| 7 | |Alice |101 | 8 | |Alice |221 | 9 | |Alice |211 | 10 | |Alice |176 | 11 | |Bob |276 | 12 | |Bob |232 | 13 | |Bob |258 | 14 | |Charlie|221 | 15 | 16 | you can compute the histogram for each user 17 | 18 | |user |≤100 |≤200 |>200 | 19 | |:-----:|:---:|:---:|:---:| 20 | |Alice |0 |2 |2 | 21 | |Bob |0 |0 |3 | 22 | |Charlie|0 |0 |1 | 23 | 24 | as follows: 25 | 26 | df.withColumn("≤100", when($"score" <= 100, 1).otherwise(0)) 27 | .withColumn("≤200", when($"score" > 100 && $"score" <= 200, 1).otherwise(0)) 28 | .withColumn(">200", when($"score" > 200, 1).otherwise(0)) 29 | .groupBy($"user") 30 | .agg( 31 | sum($"≤100").as("≤100"), 32 | sum($"≤200").as("≤200"), 33 | sum($">200").as(">200") 34 | ) 35 | .orderBy($"user") 36 | 37 | Equivalent to that query is: 38 | 39 | import uk.co.gresearch.spark._ 40 | 41 | df.histogram(Seq(100, 200), $"score", $"user").orderBy($"user") 42 | 43 | The first argument is a sequence of thresholds, the second argument provides the value column. 44 | The subsequent arguments refer to the aggregation columns (`groupBy`). Only the aggregation columns 45 | and the generated bucket columns appear in the result DataFrame. 46 | 47 | In Java, call: 48 | 49 | import uk.co.gresearch.spark.Histogram; 50 | 51 | Histogram.of(df, Arrays.asList(100, 200), new Column("score"), new Column("user")).orderBy(new Column("user")) 52 | 53 | In Python, call: 54 | 55 | import gresearch.spark 56 | 57 | df.histogram([100, 200], 'score', 'user').orderBy('user') 58 | 59 | Note that this feature is not supported in Python when connected with a [Spark Connect server](README.md#spark-connect-server).
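When connected to a Spark Connect server, where the `histogram` shortcut is unavailable, the same result can still be computed with plain PySpark column functions. The following is a minimal sketch mirroring the manual Scala query above, assuming the example `df` with columns `score` and `user`:

```python
from pyspark.sql import functions as F

# Bucket each score manually, then count per user - the same buckets the helper produces.
manual_hist = (
    df.withColumn("≤100", F.when(F.col("score") <= 100, 1).otherwise(0))
      .withColumn("≤200", F.when((F.col("score") > 100) & (F.col("score") <= 200), 1).otherwise(0))
      .withColumn(">200", F.when(F.col("score") > 200, 1).otherwise(0))
      .groupBy("user")
      .agg(
          F.sum("≤100").alias("≤100"),
          F.sum("≤200").alias("≤200"),
          F.sum(">200").alias(">200"),
      )
      .orderBy("user")
)
manual_hist.show()
```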
60 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | ## Current maintainers of the project 2 | 3 | | Maintainer | GitHub ID | 4 | | ---------------------- | ------------------------------------------------------- | 5 | | Enrico Minack | [EnricoMi](https://github.com/EnricoMi) | 6 | -------------------------------------------------------------------------------- /PYSPARK-DEPS.md: -------------------------------------------------------------------------------- 1 | # PySpark dependencies 2 | 3 | Using PySpark on a cluster requires all cluster nodes to have those Python packages installed that are required by the PySpark job. 4 | Such a deployment can be cumbersome, especially when running in an interactive notebook. 5 | 6 | The `spark-extension` package allows installing Python packages programmatically by the PySpark application itself (PySpark ≥ 3.1.0). 7 | These packages are only accessible by that PySpark application, and they are removed on calling `spark.stop()`. 8 | 9 | Either install the `spark-extension` Maven package, or the `pyspark-extension` PyPi package (on the driver only), 10 | as described [here](README.md#using-spark-extension). 11 | 12 | ## Installing packages with `pip` 13 | 14 | Python packages can be installed with `pip` as follows: 15 | 16 | ```python 17 | # noinspection PyUnresolvedReferences 18 | from gresearch.spark import * 19 | 20 | spark.install_pip_package("pandas", "pyarrow") 21 | ``` 22 | 23 | Above example installs PIP packages `pandas` and `pyarrow` via `pip`. Method `install_pip_package` takes any `pip` command line argument: 24 | 25 | ```python 26 | # install packages with version specs 27 | spark.install_pip_package("pandas==1.4.3", "pyarrow~=8.0.0") 28 | 29 | # install packages from package sources (e.g. git clone https://github.com/pandas-dev/pandas.git) 30 | spark.install_pip_package("./pandas/") 31 | 32 | # install packages from git repo 33 | spark.install_pip_package("git+https://github.com/pandas-dev/pandas.git@main") 34 | 35 | # use a pip cache directory to cache downloaded and built whl files 36 | spark.install_pip_package("pandas", "pyarrow", "--cache-dir", "/home/user/.cache/pip") 37 | 38 | # use an alternative index url (other than https://pypi.org/simple) 39 | spark.install_pip_package("pandas", "pyarrow", "--index-url", "https://artifacts.company.com/pypi/simple") 40 | 41 | # install pip packages quietly (only disables output of PIP) 42 | spark.install_pip_package("pandas", "pyarrow", "--quiet") 43 | ``` 44 | 45 | ## Installing Python projects with Poetry 46 | 47 | Python projects can be installed from sources, including their dependencies, using [Poetry](https://python-poetry.org/): 48 | 49 | ```python 50 | # noinspection PyUnresolvedReferences 51 | from gresearch.spark import * 52 | 53 | spark.install_poetry_project("../my-poetry-project/", poetry_python="../venv-poetry/bin/python") 54 | ``` 55 | 56 | ## Example 57 | 58 | This example uses `install_pip_package` in a Spark standalone cluster. 59 | 60 | First checkout the example code: 61 | 62 | ```shell 63 | git clone https://github.com/G-Research/spark-extension.git 64 | cd spark-extension/examples/python-deps 65 | ``` 66 | 67 | Build a Docker image based on the official Spark release: 68 | ```shell 69 | docker build -t spark-extension-example-docker . 
70 | ``` 71 | 72 | Start the example Spark standalone cluster consisting of a Spark master and one worker: 73 | ```shell 74 | docker compose -f docker-compose.yml up -d 75 | ``` 76 | 77 | Run the `example.py` Spark application on the example cluster: 78 | ```shell 79 | docker exec spark-master spark-submit --master spark://master:7077 --packages uk.co.gresearch.spark:spark-extension_2.12:2.13.0-3.5 /example/example.py 80 | ``` 81 | The `--packages uk.co.gresearch.spark:spark-extension_2.12:2.13.0-3.5` argument 82 | tells `spark-submit` to add the `spark-extension` Maven package to the Spark job. 83 | 84 | Alternatively, install the `pyspark-extension` PyPi package via `pip install` and remove the `--packages` argument from `spark-submit`: 85 | ```shell 86 | docker exec spark-master pip install --user pyspark_extension==2.11.1.3.5 87 | docker exec spark-master spark-submit --master spark://master:7077 /example/example.py 88 | ``` 89 | 90 | This output proves that PySpark could call into the function `func`, wich only works when Pandas and PyArrow are installed: 91 | ``` 92 | +---+ 93 | | id| 94 | +---+ 95 | | 0| 96 | | 1| 97 | | 2| 98 | +---+ 99 | ``` 100 | 101 | Test that `spark.install_pip_package("pandas", "pyarrow")` is really required by this example by removing this line from `example.py` … 102 | ```diff 103 | from pyspark.sql import SparkSession 104 | 105 | def main(): 106 | spark = SparkSession.builder.appName("spark_app").getOrCreate() 107 | 108 | def func(df): 109 | return df 110 | 111 | from gresearch.spark import install_pip_package 112 | 113 | - spark.install_pip_package("pandas", "pyarrow") 114 | spark.range(0, 3, 1, 5).mapInPandas(func, "id long").show() 115 | 116 | if __name__ == "__main__": 117 | main() 118 | ``` 119 | 120 | … and running the `spark-submit` command again. The example does not work anymore, 121 | because the Pandas and PyArrow packages are missing from the driver: 122 | ``` 123 | Traceback (most recent call last): 124 | File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/pandas/utils.py", line 27, in require_minimum_pandas_version 125 | ModuleNotFoundError: No module named 'pandas' 126 | ``` 127 | 128 | Finally, shutdown the example cluster: 129 | ```shell 130 | docker compose -f docker-compose.yml down 131 | ``` 132 | 133 | ## Known Issues 134 | 135 | Note that this feature is not supported in Python when connected with a [Spark Connect server](README.md#spark-connect-server). 136 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security and Coordinated Vulnerability Disclosure Policy 2 | 3 | This project appreciates and encourages coordinated disclosure of security vulnerabilities. We prefer that you use the GitHub reporting mechanism to privately report vulnerabilities. Under the main repository's security tab, click "Report a vulnerability" to open the advisory form. 4 | 5 | If you are unable to report it via GitHub, have received no response after repeated attempts, or have other security related questions, please contact security@gr-oss.io and mention this project in the subject line. 
-------------------------------------------------------------------------------- /build-whl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eo pipefail 4 | 5 | base=$(cd "$(dirname "$0")"; pwd) 6 | 7 | version=$(grep --max-count=1 ".*" "$base/pom.xml" | sed -E -e "s/\s*<[^>]+>//g") 8 | artifact_id=$(grep --max-count=1 ".*" "$base/pom.xml" | sed -E -e "s/\s*<[^>]+>//g") 9 | 10 | rm -rf "$base/python/pyspark/jars/" 11 | mkdir -p "$base/python/pyspark/jars/" 12 | cp -v "$base/target/$artifact_id-$version.jar" "$base/python/pyspark/jars/" 13 | if [ $(ls -1 "$base/python/pyspark/jars/" | wc -l) -ne 1 ] 14 | then 15 | echo "There are more than one jar in '$base/python/pyspark/jars/'" 16 | ls -lah "$base/python/pyspark/jars/" 17 | exit 1 18 | fi 19 | 20 | pip install build 21 | python -m build "$base/python/" 22 | 23 | # check for missing modules in whl file 24 | pyversion=${version/SNAPSHOT/dev0} 25 | pyversion=${pyversion//-/.} 26 | 27 | missing="$(diff <(cd $base/python; find gresearch -type f | grep -v ".pyc$" | sort) <(unzip -l $base/python/dist/pyspark_extension-${pyversion}-*.whl | tail -n +4 | head -n -2 | sed -E -e "s/^ +//" -e "s/ +/ /g" | cut -d " " -f 4- | sort) | grep "^<" || true)" 28 | if [ -n "$missing" ] 29 | then 30 | echo "These files are missing from the whl file:" 31 | echo "$missing" 32 | exit 1 33 | fi 34 | -------------------------------------------------------------------------------- /bump-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2020 G-Research 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | # Script to prepare release, see RELEASE.md for details 19 | 20 | set -e -o pipefail 21 | 22 | # check for clean git status 23 | readarray -t git_status < <(git status -s --untracked-files=no 2>/dev/null) 24 | if [ ${#git_status[@]} -gt 0 ] 25 | then 26 | echo "There are pending git changes:" 27 | for (( i=0; i<${#git_status[@]}; i++ )); do echo "${git_status[$i]}" ; done 28 | exit 1 29 | fi 30 | 31 | function next_version { 32 | local version=$1 33 | local branch=$2 34 | 35 | patch=${version/*./} 36 | majmin=${version%.${patch}} 37 | 38 | if [[ $branch == "master" ]] 39 | then 40 | # minor version bump 41 | if [[ $version != *".0" ]] 42 | then 43 | echo "version is patch version, should be M.m.0: $version" >&2 44 | exit 1 45 | fi 46 | maj=${version/.*/} 47 | min=${majmin#${maj}.} 48 | next=${maj}.$((min+1)).0 49 | echo "$next" 50 | else 51 | # patch version bump 52 | next=${majmin}.$((patch+1)) 53 | echo "$next" 54 | fi 55 | } 56 | 57 | # get release and next version 58 | version=$(grep --max-count=1 ".*" pom.xml | sed -E -e "s/\s*<[^>]+>//g") 59 | pkg_version="${version/-*/}" 60 | branch=$(git rev-parse --abbrev-ref HEAD) 61 | next_pkg_version="$(next_version "$pkg_version" "$branch")" 62 | 63 | # bump the version 64 | echo "Bump version to $next_pkg_version" 65 | ./set-version.sh $next_pkg_version-SNAPSHOT 66 | 67 | # commit changes to local repo 68 | echo 69 | echo "Committing release to local git" 70 | git commit -a -m "Post-release version bump to $next_pkg_version" 71 | git show HEAD 72 | echo 73 | 74 | # push version bump to origin 75 | echo "Press to push commit to origin" 76 | read 77 | 78 | echo "Pushing release commit to origin" 79 | git push origin "master" 80 | echo 81 | -------------------------------------------------------------------------------- /examples/python-deps/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/spark:3.5.0 2 | 3 | ENV PATH="${PATH}:/opt/spark/bin" 4 | 5 | USER root 6 | RUN mkdir -p /home/spark; chown spark:spark /home/spark 7 | USER spark 8 | -------------------------------------------------------------------------------- /examples/python-deps/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | master: 4 | container_name: spark-master 5 | image: spark-extension-example-docker 6 | command: /opt/spark/bin/spark-class org.apache.spark.deploy.master.Master -h master 7 | environment: 8 | MASTER: spark://master:7077 9 | SPARK_PUBLIC_DNS: localhost 10 | SPARK_MASTER_WEBUI_PORT: 8080 11 | PYSPARK_PYTHON: python${PYTHON_VERSION:-3.8} 12 | PYSPARK_DRIVER_PYTHON: python${PYTHON_VERSION:-3.8} 13 | expose: 14 | - 7077 15 | ports: 16 | - 4040:4040 17 | - 8080:8080 18 | volumes: 19 | - ./:/example 20 | 21 | worker: 22 | container_name: spark-worker 23 | image: spark-extension-example-docker 24 | command: /opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077 25 | environment: 26 | SPARK_WORKER_CORES: 1 27 | SPARK_WORKER_MEMORY: 1g 28 | SPARK_WORKER_PORT: 8881 29 | SPARK_WORKER_WEBUI_PORT: 8081 30 | SPARK_PUBLIC_DNS: localhost 31 | links: 32 | - master 33 | ports: 34 | - 8081:8081 35 | 36 | -------------------------------------------------------------------------------- /examples/python-deps/example.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | def main(): 4 | spark = 
SparkSession.builder.appName("spark_app").getOrCreate() 5 | 6 | def func(df): 7 | return df 8 | 9 | from gresearch.spark import install_pip_package 10 | 11 | spark.install_pip_package("pandas", "pyarrow") 12 | spark.range(0, 3, 1, 5).mapInPandas(func, "id long").show() 13 | 14 | if __name__ == "__main__": 15 | main() 16 | -------------------------------------------------------------------------------- /python/gresearch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /python/gresearch/spark/diff/comparator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import abc 16 | import dataclasses 17 | from dataclasses import dataclass 18 | 19 | from py4j.java_gateway import JVMView, JavaObject 20 | 21 | from pyspark.sql import Column 22 | from pyspark.sql.functions import abs, greatest, lit 23 | from pyspark.sql.types import DataType 24 | 25 | 26 | class DiffComparator(abc.ABC): 27 | @abc.abstractmethod 28 | def equiv(self, left: Column, right: Column) -> Column: 29 | pass 30 | 31 | 32 | class DiffComparators: 33 | @staticmethod 34 | def default() -> 'DefaultDiffComparator': 35 | return DefaultDiffComparator() 36 | 37 | @staticmethod 38 | def nullSafeEqual() -> 'NullSafeEqualDiffComparator': 39 | return NullSafeEqualDiffComparator() 40 | 41 | @staticmethod 42 | def epsilon(epsilon: float) -> 'EpsilonDiffComparator': 43 | return EpsilonDiffComparator(epsilon) 44 | 45 | @staticmethod 46 | def string(whitespace_agnostic: bool = True) -> 'StringDiffComparator': 47 | return StringDiffComparator(whitespace_agnostic) 48 | 49 | @staticmethod 50 | def duration(duration: str) -> 'DurationDiffComparator': 51 | return DurationDiffComparator(duration) 52 | 53 | @staticmethod 54 | def map(key_type: DataType, value_type: DataType, key_order_sensitive: bool = False) -> 'MapDiffComparator': 55 | return MapDiffComparator(key_type, value_type, key_order_sensitive) 56 | 57 | 58 | class NullSafeEqualDiffComparator(DiffComparator): 59 | def equiv(self, left: Column, right: Column) -> Column: 60 | return left.eqNullSafe(right) 61 | 62 | 63 | class DefaultDiffComparator(NullSafeEqualDiffComparator): 64 | # for testing only 65 | def _to_java(self, jvm: JVMView) -> JavaObject: 66 | return jvm.uk.co.gresearch.spark.diff.DiffComparators.default() 67 | 68 | 69 | @dataclass(frozen=True) 70 | class EpsilonDiffComparator(DiffComparator): 71 | epsilon: float 72 | relative: bool = True 73 | inclusive: bool = True 74 | 75 | def as_relative(self) -> 'EpsilonDiffComparator': 76 | return dataclasses.replace(self, relative=True) 77 | 78 | def as_absolute(self) -> 'EpsilonDiffComparator': 79 | return dataclasses.replace(self, relative=False) 80 | 81 | def as_inclusive(self) -> 'EpsilonDiffComparator': 82 | return dataclasses.replace(self, inclusive=True) 83 | 84 | def as_exclusive(self) -> 'EpsilonDiffComparator': 85 | return dataclasses.replace(self, inclusive=False) 86 | 87 | def equiv(self, left: Column, right: Column) -> Column: 88 | threshold = greatest(abs(left), abs(right)) * self.epsilon if self.relative else lit(self.epsilon) 89 | 90 | def inclusive_epsilon(diff: Column) -> Column: 91 | return diff.__le__(threshold) 92 | 93 | def exclusive_epsilon(diff: Column) -> Column: 94 | return diff.__lt__(threshold) 95 | 96 | in_epsilon = inclusive_epsilon if self.inclusive else exclusive_epsilon 97 | return left.isNull() & right.isNull() | left.isNotNull() & right.isNotNull() & in_epsilon(abs(left - right)) 98 | 99 | 100 | @dataclass(frozen=True) 101 | class StringDiffComparator(DiffComparator): 102 | whitespace_agnostic: bool 103 | 104 | def equiv(self, left: Column, right: Column) -> Column: 105 | return left.eqNullSafe(right) 106 | 107 | 108 | @dataclass(frozen=True) 109 | class DurationDiffComparator(DiffComparator): 110 | duration: str 111 | inclusive: bool = True 112 | 113 | def as_inclusive(self) -> 'DurationDiffComparator': 114 | return dataclasses.replace(self, inclusive=True) 115 | 116 | def as_exclusive(self) -> 'DurationDiffComparator': 117 | return dataclasses.replace(self, inclusive=False) 118 | 119 | def equiv(self, left: Column, right: Column) -> Column: 120 | 
return left.eqNullSafe(right) 121 | 122 | 123 | @dataclass(frozen=True) 124 | class MapDiffComparator(DiffComparator): 125 | key_type: DataType 126 | value_type: DataType 127 | key_order_sensitive: bool 128 | 129 | def equiv(self, left: Column, right: Column) -> Column: 130 | return left.eqNullSafe(right) 131 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /python/requirements-3.0_2.12.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.0.3 4 | -------------------------------------------------------------------------------- /python/requirements-3.1_2.12.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.1.3 4 | -------------------------------------------------------------------------------- /python/requirements-3.2_2.12.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.2.2 4 | -------------------------------------------------------------------------------- /python/requirements-3.2_2.13.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.2.2 4 | -------------------------------------------------------------------------------- /python/requirements-3.3_2.12.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.3.1 4 | -------------------------------------------------------------------------------- /python/requirements-3.3_2.13.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.3.1 4 | -------------------------------------------------------------------------------- /python/requirements-3.4_2.12.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.4.0 4 | -------------------------------------------------------------------------------- /python/requirements-3.4_2.13.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.4.0 4 | -------------------------------------------------------------------------------- /python/requirements-3.5_2.12.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.5.0 4 | -------------------------------------------------------------------------------- /python/requirements-3.5_2.13.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.5.0 4 | -------------------------------------------------------------------------------- /python/requirements-4.0_2.13.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==4.0.0 4 | -------------------------------------------------------------------------------- 
/python/requirements-4.1_2.13.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==4.0.0 4 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2023 G-Research 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from pathlib import Path 18 | from setuptools import setup 19 | 20 | jar_version = '2.14.0-3.5-SNAPSHOT' 21 | scala_version = '2.13.8' 22 | scala_compat_version = '.'.join(scala_version.split('.')[:2]) 23 | spark_compat_version = jar_version.split('-')[1] 24 | version = jar_version.replace('SNAPSHOT', 'dev0').replace('-', '.') 25 | 26 | # read the contents of the README.md file 27 | long_description = (Path(__file__).parent / "README.md").read_text() 28 | 29 | setup( 30 | name="pyspark-extension", 31 | version=version, 32 | description="A library that provides useful extensions to Apache Spark.", 33 | long_description=long_description, 34 | long_description_content_type="text/markdown", 35 | author="Enrico Minack", 36 | author_email="github@enrico.minack.dev", 37 | url="https://github.com/G-Research/spark-extension", 38 | tests_require=[f"pyspark~={spark_compat_version}.0", "py4j"], 39 | packages=[ 40 | "gresearch", 41 | "gresearch.spark", 42 | "gresearch.spark.diff", 43 | "gresearch.spark.diff.comparator", 44 | "gresearch.spark.parquet", 45 | "pyspark.jars", 46 | ], 47 | include_package_data=False, 48 | package_data={ 49 | "pyspark.jars": [f"*_{scala_compat_version}-{jar_version}.jar"], 50 | }, 51 | license="http://www.apache.org/licenses/LICENSE-2.0.html", 52 | python_requires=">=3.7", 53 | classifiers=[ 54 | "Development Status :: 5 - Production/Stable", 55 | "License :: OSI Approved :: Apache Software License", 56 | "Programming Language :: Python :: 3", 57 | "Programming Language :: Python :: 3.7", 58 | "Programming Language :: Python :: 3.8", 59 | "Programming Language :: Python :: 3.9", 60 | "Programming Language :: Python :: 3.10", 61 | "Programming Language :: Python :: 3.11", 62 | "Programming Language :: Python :: 3.12", 63 | "Programming Language :: Python :: 3.13", 64 | "Programming Language :: Python :: Implementation :: CPython", 65 | "Programming Language :: Python :: Implementation :: PyPy", 66 | "Typing :: Typed", 67 | ], 68 | ) 69 | -------------------------------------------------------------------------------- /python/test/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /python/test/requirements.txt: -------------------------------------------------------------------------------- 1 | grpcio>=1.48.1 2 | pandas>=1.0.5 3 | pyarrow>=4.0.0 4 | pytest 5 | unittest-xml-reporting 6 | -------------------------------------------------------------------------------- /python/test/spark_common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | import os 17 | import sys 18 | import unittest 19 | from contextlib import contextmanager 20 | from pathlib import Path 21 | 22 | from pyspark import SparkConf 23 | from pyspark.sql import SparkSession 24 | 25 | logger = logging.getLogger() 26 | logger.level = logging.INFO 27 | 28 | 29 | @contextmanager 30 | def spark_session(): 31 | session = SparkTest.get_spark_session() 32 | try: 33 | yield session 34 | finally: 35 | session.stop() 36 | 37 | 38 | class SparkTest(unittest.TestCase): 39 | 40 | @staticmethod 41 | def main(file: str): 42 | if len(sys.argv) == 2: 43 | # location to store test results provided, this requires package unittest-xml-reporting 44 | import xmlrunner 45 | 46 | unittest.main( 47 | module=f'test.{Path(file).name[:-3]}', 48 | testRunner=xmlrunner.XMLTestRunner(output=sys.argv[1]), 49 | argv=sys.argv[:1], 50 | # these make sure that some options that are not applicable 51 | # remain hidden from the help menu. 
52 | failfast=False, buffer=False, catchbreak=False 53 | ) 54 | else: 55 | unittest.main() 56 | 57 | @staticmethod 58 | def get_pom_path() -> str: 59 | paths = ['.', '..', os.path.join('..', '..')] 60 | for path in paths: 61 | if os.path.exists(os.path.join(path, 'pom.xml')): 62 | return path 63 | raise RuntimeError('Could not find path to pom.xml, looked here: {}'.format(', '.join(paths))) 64 | 65 | @staticmethod 66 | def get_spark_config(path) -> SparkConf: 67 | master = 'local[2]' 68 | conf = SparkConf().setAppName('unit test').setMaster(master) 69 | return conf.setAll([ 70 | ('spark.ui.showConsoleProgress', 'false'), 71 | ('spark.test.home', os.environ.get('SPARK_HOME')), 72 | ('spark.locality.wait', '0'), 73 | ('spark.driver.extraClassPath', '{}'.format(':'.join([ 74 | os.path.join(os.getcwd(), path, 'target', 'classes'), 75 | os.path.join(os.getcwd(), path, 'target', 'test-classes'), 76 | ]))), 77 | ]) 78 | 79 | @classmethod 80 | def get_spark_session(cls) -> SparkSession: 81 | builder = SparkSession.builder 82 | 83 | if 'TEST_SPARK_CONNECT_SERVER' in os.environ: 84 | builder.remote(os.environ['TEST_SPARK_CONNECT_SERVER']) 85 | elif 'PYSPARK_GATEWAY_PORT' in os.environ: 86 | logging.info('Running inside existing Spark environment') 87 | else: 88 | logging.info('Setting up Spark environment') 89 | path = cls.get_pom_path() 90 | conf = cls.get_spark_config(path) 91 | builder.config(conf=conf) 92 | 93 | return builder.getOrCreate() 94 | 95 | spark: SparkSession = None 96 | is_spark_connect: bool = 'TEST_SPARK_CONNECT_SERVER' in os.environ 97 | 98 | @classmethod 99 | def setUpClass(cls): 100 | super(SparkTest, cls).setUpClass() 101 | logging.info('launching Spark session') 102 | cls.spark = cls.get_spark_session() 103 | 104 | @classmethod 105 | def tearDownClass(cls): 106 | logging.info('stopping Spark session') 107 | cls.spark.stop() 108 | super(SparkTest, cls).tearDownClass() 109 | 110 | @contextmanager 111 | def sql_conf(self, pairs): 112 | """ 113 | Copied from pyspark/testing/sqlutils available from PySpark 3.5.0 and higher. 114 | https://github.com/apache/spark/blob/v3.5.0/python/pyspark/testing/sqlutils.py#L171 115 | http://www.apache.org/licenses/LICENSE-2.0 116 | 117 | A convenient context manager to test some configuration specific logic. This sets 118 | `value` to the configuration `key` and then restores it back when it exits. 119 | """ 120 | assert isinstance(pairs, dict), "pairs should be a dictionary." 121 | assert hasattr(self, "spark"), "it should have 'spark' attribute, having a spark session." 122 | 123 | keys = pairs.keys() 124 | new_values = pairs.values() 125 | old_values = [self.spark.conf.get(key, None) for key in keys] 126 | for key, new_value in zip(keys, new_values): 127 | self.spark.conf.set(key, new_value) 128 | try: 129 | yield 130 | finally: 131 | for key, old_value in zip(keys, old_values): 132 | if old_value is None: 133 | self.spark.conf.unset(key) 134 | else: 135 | self.spark.conf.set(key, old_value) 136 | -------------------------------------------------------------------------------- /python/test/test_histogram.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from unittest import skipIf 16 | 17 | from spark_common import SparkTest 18 | import gresearch.spark 19 | 20 | 21 | @skipIf(SparkTest.is_spark_connect, "Spark Connect does not provide access to the JVM, required by Historgam") 22 | class HistogramTest(SparkTest): 23 | 24 | @classmethod 25 | def setUpClass(cls): 26 | super(HistogramTest, cls).setUpClass() 27 | 28 | cls.df = cls.spark.createDataFrame([ 29 | (1, 1), 30 | (1, 2), 31 | (1, 10), 32 | (2, -3), 33 | (2, 5), 34 | (3, 8), 35 | ], ['id', 'value']) 36 | 37 | def test_histogram_with_ints(self): 38 | hist = self.df.histogram([-5, 0, 5], 'value', 'id').orderBy('id').collect() 39 | self.assertEqual([ 40 | {'id': 1, '≤-5': 0, '≤0': 0, '≤5': 2, '>5': 1}, 41 | {'id': 2, '≤-5': 0, '≤0': 1, '≤5': 1, '>5': 0}, 42 | {'id': 3, '≤-5': 0, '≤0': 0, '≤5': 0, '>5': 1}, 43 | ], [row.asDict() for row in hist]) 44 | 45 | def test_histogram_with_floats(self): 46 | hist = self.df.histogram([-5.0, 0.0, 5.0], 'value', 'id').orderBy('id').collect() 47 | self.assertEqual([ 48 | {'id': 1, '≤-5.0': 0, '≤0.0': 0, '≤5.0': 2, '>5.0': 1}, 49 | {'id': 2, '≤-5.0': 0, '≤0.0': 1, '≤5.0': 1, '>5.0': 0}, 50 | {'id': 3, '≤-5.0': 0, '≤0.0': 0, '≤5.0': 0, '>5.0': 1}, 51 | ], [row.asDict() for row in hist]) 52 | 53 | 54 | if __name__ == '__main__': 55 | SparkTest.main(__file__) 56 | -------------------------------------------------------------------------------- /python/test/test_job_description.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from unittest import skipIf 16 | 17 | from pyspark import TaskContext, SparkContext 18 | from typing import Optional 19 | 20 | from spark_common import SparkTest 21 | from gresearch.spark import job_description, append_job_description 22 | 23 | 24 | @skipIf(SparkTest.is_spark_connect, "Spark Connect does not provide access to the JVM, required by JobDescription") 25 | class JobDescriptionTest(SparkTest): 26 | 27 | def _assert_job_description(self, expected: Optional[str]): 28 | def get_job_description_func(part): 29 | def func(row): 30 | return row.id, part, TaskContext.get().getLocalProperty("spark.job.description") 31 | return func 32 | 33 | descriptions = self.spark.range(3, numPartitions=3).rdd \ 34 | .mapPartitionsWithIndex(lambda part, it: map(get_job_description_func(part), it)) \ 35 | .collect() 36 | self.assertEqual( 37 | [(0, 0, expected), (1, 1, expected), (2, 2, expected)], 38 | descriptions 39 | ) 40 | 41 | def setUp(self) -> None: 42 | SparkContext._active_spark_context.setJobDescription(None) 43 | 44 | def test_with_job_description(self): 45 | self._assert_job_description(None) 46 | with job_description("job description"): 47 | self._assert_job_description("job description") 48 | with job_description("inner job description"): 49 | self._assert_job_description("inner job description") 50 | self._assert_job_description("job description") 51 | with job_description("inner job description", True): 52 | self._assert_job_description("job description") 53 | self._assert_job_description("job description") 54 | self._assert_job_description(None) 55 | with job_description("other job description", True): 56 | self._assert_job_description("other job description") 57 | self._assert_job_description(None) 58 | 59 | def test_append_job_description(self): 60 | self._assert_job_description(None) 61 | with append_job_description("job"): 62 | self._assert_job_description("job") 63 | with append_job_description("description"): 64 | self._assert_job_description("job - description") 65 | self._assert_job_description("job") 66 | with append_job_description("description 2", " "): 67 | self._assert_job_description("job description 2") 68 | self._assert_job_description("job") 69 | self._assert_job_description(None) 70 | 71 | 72 | if __name__ == '__main__': 73 | SparkTest.main(__file__) 74 | -------------------------------------------------------------------------------- /python/test/test_parquet.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from pathlib import Path 16 | from unittest import skipIf 17 | 18 | from spark_common import SparkTest 19 | import gresearch.spark.parquet 20 | 21 | 22 | @skipIf(SparkTest.is_spark_connect, "Spark Connect does not provide access to the JVM, required by Parquet") 23 | class ParquetTest(SparkTest): 24 | 25 | test_file = str((Path(__file__).parent.parent.parent / "src" / "test" / "files" / "test.parquet").resolve()) 26 | 27 | def test_parquet_metadata(self): 28 | self.assertEqual(self.spark.read.parquet_metadata(self.test_file).count(), 2) 29 | self.assertEqual(self.spark.read.parquet_metadata(self.test_file, self.test_file).count(), 2) 30 | self.assertEqual(self.spark.read.parquet_metadata(self.test_file, parallelism=100).count(), 2) 31 | self.assertEqual(self.spark.read.parquet_metadata(self.test_file, self.test_file, parallelism=100).count(), 2) 32 | 33 | def test_parquet_schema(self): 34 | self.assertEqual(self.spark.read.parquet_schema(self.test_file).count(), 4) 35 | self.assertEqual(self.spark.read.parquet_schema(self.test_file, self.test_file).count(), 4) 36 | self.assertEqual(self.spark.read.parquet_schema(self.test_file, parallelism=100).count(), 4) 37 | self.assertEqual(self.spark.read.parquet_schema(self.test_file, self.test_file, parallelism=100).count(), 4) 38 | 39 | def test_parquet_blocks(self): 40 | self.assertEqual(self.spark.read.parquet_blocks(self.test_file).count(), 3) 41 | self.assertEqual(self.spark.read.parquet_blocks(self.test_file, self.test_file).count(), 3) 42 | self.assertEqual(self.spark.read.parquet_blocks(self.test_file, parallelism=100).count(), 3) 43 | self.assertEqual(self.spark.read.parquet_blocks(self.test_file, self.test_file, parallelism=100).count(), 3) 44 | 45 | def test_parquet_block_columns(self): 46 | self.assertEqual(self.spark.read.parquet_block_columns(self.test_file).count(), 6) 47 | self.assertEqual(self.spark.read.parquet_block_columns(self.test_file, self.test_file).count(), 6) 48 | self.assertEqual(self.spark.read.parquet_block_columns(self.test_file, parallelism=100).count(), 6) 49 | self.assertEqual(self.spark.read.parquet_block_columns(self.test_file, self.test_file, parallelism=100).count(), 6) 50 | 51 | def test_parquet_partitions(self): 52 | self.assertEqual(self.spark.read.parquet_partitions(self.test_file).count(), 2) 53 | self.assertEqual(self.spark.read.parquet_partitions(self.test_file, self.test_file).count(), 2) 54 | self.assertEqual(self.spark.read.parquet_partitions(self.test_file, parallelism=100).count(), 2) 55 | self.assertEqual(self.spark.read.parquet_partitions(self.test_file, self.test_file, parallelism=100).count(), 2) 56 | 57 | 58 | if __name__ == '__main__': 59 | SparkTest.main(__file__) 60 | -------------------------------------------------------------------------------- /set-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -eq 1 ] 4 | then 5 | IFS=- 6 | read version flavour <<< "$1" 7 | 8 | echo "setting version=$version${flavour:+ with }$flavour" 9 | 10 | sed -i -E \ 11 | -e "s%^( )[^-]+-([^-]+).*()$%\1$version-\2${flavour:+-}$flavour\3%" \ 12 | pom.xml 13 | 14 | version=$(grep -m 1 version pom.xml | sed "s/\s*<[^>]*>\s*//g") 15 | 16 | sed -i -E \ 17 | -e "s/(jar_version *= *).*/\1'$version'/" \ 18 | python/setup.py 19 | elif [ $# -eq 2 ] 20 | then 21 | spark=$1 22 | scala=$2 23 | 24 | spark_compat=${spark%.*} 25 | scala_compat=${scala%.*} 26 | 27 | spark_major=${spark_compat%.*} 28 | scala_major=${scala_compat%.*} 29 | 30 | 
spark_minor=${spark_compat/*./} 31 | scala_minor=${scala_compat/*./} 32 | 33 | spark_patch=${spark/*./} 34 | scala_patch=${scala/*./} 35 | 36 | echo "setting spark=$spark and scala=$scala" 37 | sed -i -E \ 38 | -e "s%^( )([^_]+)[_0-9.]+()$%\1\2_${scala_compat}\3%" \ 39 | -e "s%^( )([^-]+)-[^-]+(.*)$%\1\2-$spark_compat\3%" \ 40 | -e "s%^( ).+()$%\1${scala_major}\2%" \ 41 | -e "s%^( ).+()$%\1${scala_minor}\2%" \ 42 | -e "s%^( ).+()$%\1${scala_patch}\2%" \ 43 | -e "s%^( ).+()$%\1${spark_major}\2%" \ 44 | -e "s%^( ).+()$%\1${spark_minor}\2%" \ 45 | -e "s%^( ).+()$%\1${spark_patch}\2%" \ 46 | pom.xml 47 | 48 | version=$(grep -m 1 version pom.xml | sed "s/\s*<[^>]*>\s*//g") 49 | 50 | sed -i -E \ 51 | -e "s/(jar_version *= *).*/\1'$version'/" \ 52 | -e "s/(scala_version *= *).*/\1'$scala'/" \ 53 | python/setup.py 54 | else 55 | echo "Provide the Spark-Extension version (e.g. 2.5.0 or 2.5.0-SNAPSHOT), or the Spark and Scala version" 56 | exit 1 57 | fi 58 | 59 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/org/apache/spark/sql/extension/package.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.5/org/apache/spark/sql/extension/package.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.5/uk/co/gresearch/spark/Backticks.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.1/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.sql.catalyst.expressions.Expression 20 | 21 | object UnixMicros { 22 | def unixMicros(child: Expression): Expression = { 23 | throw new NotImplementedError() 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.1/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.1/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/SplitFile.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/org/apache/spark/sql/extension/package.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.5/org/apache/spark/sql/extension/package.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.5/uk/co/gresearch/spark/Backticks.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.sql.catalyst.trees.TreeNode 20 | 21 | /** 22 | * Spark version specific trait that back-ports BinaryLike[T].withNewChildrenInternal(T, T) 23 | * to Spark 3.0 and 3.1. This is empty in Spark 3.2 and beyond. 
24 | */ 25 | trait BinaryLikeWithNewChildrenInternal[T <: TreeNode[T]] { 26 | self: TreeNode[T] => 27 | 28 | /** 29 | * Method `withNewChildrenInternal` is required for Spark 3.2 and beyond. 30 | * Before that, Spark calls `withNewChildren`, which uses `makeCopy`, which 31 | * "Must be overridden by child classes that have constructor arguments 32 | * that are not present in the productIterator.", 33 | * which is not the case where BinaryLikeWithNewChildrenInternal is used here. 34 | * So nothing needs to be overridden. 35 | */ 36 | protected def withNewChildrenInternal(newLeft: T, newRight: T): T 37 | } -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.sql.catalyst.expressions 20 | import org.apache.spark.sql.catalyst.expressions.Expression 21 | 22 | object UnixMicros { 23 | def unixMicros(child: Expression): Expression = expressions.UnixMicros(child) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.parquet.hadoop.metadata.BlockMetaData 20 | 21 | object BlockMetaDataUtil { 22 | def getOrdinal(block: BlockMetaData): Option[Int] = None 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.parquet.schema.PrimitiveType 20 | 21 | object PrimitiveTypeUtil { 22 | def getLogicalTypeAnnotation(primitive: PrimitiveType): Option[String] = None 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/SplitFile.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/org/apache/spark/sql/extension/package.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.5/org/apache/spark/sql/extension/package.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.5/uk/co/gresearch/spark/Backticks.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | /** 20 | * Spark version specific trait that back-ports BinaryLike[T].withNewChildrenInternal(T, T) 21 | * to Spark 3.0 and 3.1. This is empty in Spark 3.2 and beyond. 22 | */ 23 | trait BinaryLikeWithNewChildrenInternal[T] 24 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.1/uk/co/gresearch/spark/UnixMicros.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.parquet.hadoop.metadata.BlockMetaData 20 | 21 | object BlockMetaDataUtil { 22 | def getOrdinal(block: BlockMetaData): Option[Int] = Some(block.getOrdinal) 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.parquet.schema.PrimitiveType 20 | 21 | object PrimitiveTypeUtil { 22 | def getLogicalTypeAnnotation(primitive: PrimitiveType): Option[String] = 23 | Option(primitive.getLogicalTypeAnnotation).map(_.toString) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.spark.sql.execution.datasources.PartitionedFile 20 | 21 | case class SplitFile(filePath: String, start: Long, length: Long, fileSize: Option[Long]) 22 | 23 | object SplitFile { 24 | def apply(file: PartitionedFile): SplitFile = SplitFile(file.filePath, file.start, file.length, None) 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/org/apache/spark/sql/extension/package.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.5/org/apache/spark/sql/extension/package.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.5/uk/co/gresearch/spark/Backticks.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.2/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.1/uk/co/gresearch/spark/UnixMicros.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala 
-------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.spark.sql.execution.datasources.PartitionedFile 20 | 21 | case class SplitFile(filePath: String, start: Long, length: Long, fileSize: Option[Long]) 22 | 23 | object SplitFile { 24 | def apply(file: PartitionedFile): SplitFile = SplitFile(file.filePath, file.start, file.length, Some(file.fileSize)) 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/org/apache/spark/sql/extension/package.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.5/org/apache/spark/sql/extension/package.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.5/uk/co/gresearch/spark/Backticks.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.2/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.1/uk/co/gresearch/spark/UnixMicros.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.parquet.hadoop.metadata.FileMetaData 20 | 21 | object FileMetaDataUtil { 22 | def getEncryptionType(fileMetaData: FileMetaData): Option[String] = None 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-4.0/uk/co/gresearch/spark/parquet/SplitFile.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/org/apache/spark/sql/extension/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import org.apache.spark.sql.catalyst.expressions.Expression 20 | 21 | package object extension { 22 | implicit class ColumnExtension(col: Column) { 23 | // Column.expr exists in this Spark version 24 | def sql: String = col.expr.sql 25 | } 26 | 27 | implicit class ExpressionExtension(expr: Expression) { 28 | def column: Column = new Column(expr) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import java.util.regex.Pattern 20 | 21 | object Backticks { 22 | 23 | // https://github.com/apache/spark/blob/523ff15/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/QuotingUtils.scala#L46 24 | private val validIdentPattern = Pattern.compile("^[a-zA-Z_][a-zA-Z0-9_]*") 25 | 26 | /** 27 | * Detects if column name part requires quoting. 28 | * https://github.com/apache/spark/blob/523ff15/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/QuotingUtils.scala#L48 29 | */ 30 | private def needQuote(part: String): Boolean = { 31 | !validIdentPattern.matcher(part).matches() 32 | } 33 | 34 | /** 35 | * Encloses the given strings with backticks (backquotes) if needed. 36 | * 37 | * Backticks are not needed for strings that start with a letter (`a`-`z` and `A`-`Z`) or an underscore, 38 | * and contain only letters, numbers and underscores. 39 | * 40 | * Multiple strings will be enclosed individually and concatenated with dots (`.`). 41 | * 42 | * This is useful when referencing column names that contain special characters like dots (`.`) or backquotes. 43 | * 44 | * Examples: 45 | * {{{ 46 | * col("a.column") // this references the field "column" of column "a" 47 | * col("`a.column`") // this reference the column with the name "a.column" 48 | * col(Backticks.column_name("column")) // produces "column" 49 | * col(Backticks.column_name("a.column")) // produces "`a.column`" 50 | * col(Backticks.column_name("a column")) // produces "`a column`" 51 | * col(Backticks.column_name("`a.column`")) // produces "`a.column`" 52 | * col(Backticks.column_name("a.column", "a.field")) // produces "`a.column`.`a.field`" 53 | * }}} 54 | * 55 | * @param string 56 | * a string 57 | * @param strings 58 | * more strings 59 | * @return 60 | */ 61 | @scala.annotation.varargs 62 | def column_name(string: String, strings: String*): String = 63 | (string +: strings) 64 | .map(s => if (needQuote(s)) s"`${s.replace("`", "``")}`" else s) 65 | .mkString(".") 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.2/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.1/uk/co/gresearch/spark/UnixMicros.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with 
the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.parquet.hadoop.metadata.FileMetaData 20 | 21 | object FileMetaDataUtil { 22 | def getEncryptionType(fileMetaData: FileMetaData): Option[String] = 23 | Some(fileMetaData.getEncryptionType.name()) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-4.0/uk/co/gresearch/spark/parquet/SplitFile.scala -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/org/apache/spark/sql/extension/extension.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import org.apache.spark.sql.catalyst.expressions.Expression 20 | import org.apache.spark.sql.classic.ExpressionUtils.{column => toColumn, expression} 21 | 22 | package object extension { 23 | implicit class ColumnExtension(col: Column) { 24 | def expr: Expression = expression(col) 25 | def sql: String = col.node.sql 26 | } 27 | 28 | implicit class ExpressionExtension(expr: Expression) { 29 | def column: Column = toColumn(expr) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | package uk.co.gresearch.spark 2 | 3 | import org.apache.spark.sql.catalyst.util.QuotingUtils 4 | 5 | /* 6 | * Copyright 2021 G-Research 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 
10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | object Backticks { 22 | 23 | /** 24 | * Encloses the given strings with backticks (backquotes) if needed. 25 | * 26 | * Backticks are not needed for strings that start with a letter (`a`-`z` and `A`-`Z`) or an underscore, 27 | * and contain only letters, numbers and underscores. 28 | * 29 | * Multiple strings will be enclosed individually and concatenated with dots (`.`). 30 | * 31 | * This is useful when referencing column names that contain special characters like dots (`.`) or backquotes. 32 | * 33 | * Examples: 34 | * {{{ 35 | * col("a.column") // this references the field "column" of column "a" 36 | * col("`a.column`") // this reference the column with the name "a.column" 37 | * col(Backticks.column_name("column")) // produces "column" 38 | * col(Backticks.column_name("a.column")) // produces "`a.column`" 39 | * col(Backticks.column_name("a column")) // produces "`a column`" 40 | * col(Backticks.column_name("`a.column`")) // produces "`a.column`" 41 | * col(Backticks.column_name("a.column", "a.field")) // produces "`a.column`.`a.field`" 42 | * }}} 43 | * 44 | * @param string 45 | * a string 46 | * @param strings 47 | * more strings 48 | * @return 49 | */ 50 | @scala.annotation.varargs 51 | def column_name(string: String, strings: String*): String = 52 | QuotingUtils.quoted(Array.from(string +: strings)) 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.2/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.1/uk/co/gresearch/spark/UnixMicros.scala -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.5/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala -------------------------------------------------------------------------------- 
/src/main/scala-spark-4.0/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.spark.sql.execution.datasources.PartitionedFile 20 | 21 | case class SplitFile(filePath: String, start: Long, length: Long, fileSize: Option[Long]) 22 | 23 | object SplitFile { 24 | def apply(file: PartitionedFile): SplitFile = SplitFile(file.filePath.toString, file.start, file.length, Some(file.fileSize)) 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala-spark-4.1: -------------------------------------------------------------------------------- 1 | scala-spark-4.0 -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co 18 | 19 | package object gresearch { 20 | 21 | trait ConditionalCall[T] { 22 | def call(f: T => T): T 23 | def either[R](f: T => R): ConditionalCallOr[T, R] 24 | } 25 | 26 | trait ConditionalCallOr[T, R] { 27 | def or(f: T => R): R 28 | } 29 | 30 | case class TrueCall[T](t: T) extends ConditionalCall[T] { 31 | override def call(f: T => T): T = f(t) 32 | override def either[R](f: T => R): ConditionalCallOr[T, R] = TrueCallOr[T, R](f(t)) 33 | } 34 | 35 | case class FalseCall[T](t: T) extends ConditionalCall[T] { 36 | override def call(f: T => T): T = t 37 | override def either[R](f: T => R): ConditionalCallOr[T, R] = FalseCallOr[T, R](t) 38 | } 39 | 40 | case class TrueCallOr[T, R](r: R) extends ConditionalCallOr[T, R] { 41 | override def or(f: T => R): R = r 42 | } 43 | 44 | case class FalseCallOr[T, R](t: T) extends ConditionalCallOr[T, R] { 45 | override def or(f: T => R): R = f(t) 46 | } 47 | 48 | implicit class ExtendedAny[T](t: T) { 49 | 50 | /** 51 | * Allows to call a function on the decorated instance conditionally. 
52 | * 53 | * This allows fluent code like 54 | * 55 | * {{{ 56 | * i.doThis() 57 | * .doThat() 58 | * .on(condition).call(function) 59 | * .on(condition).either(function1).or(function2) 60 | * .doMore() 61 | * }}} 62 | * 63 | * rather than 64 | * 65 | * {{{ 66 | * val temp = i.doThis() 67 | * .doThat() 68 | * val temp2 = if (condition) function(temp) else temp 69 | * temp2.doMore() 70 | * }}} 71 | * 72 | * which either needs many temporary variables or duplicate code. 73 | * 74 | * @param condition 75 | * condition 76 | * @return 77 | * the function result 78 | */ 79 | def on(condition: Boolean): ConditionalCall[T] = { 80 | if (condition) TrueCall[T](t) else FalseCall[T](t) 81 | } 82 | 83 | /** 84 | * Allows to call a function on the decorated instance conditionally. This is an alias for the `on` function. 85 | * 86 | * This allows fluent code like 87 | * 88 | * {{{ 89 | * i.doThis() 90 | * .doThat() 91 | * .when(condition).call(function) 92 | * .when(condition).either(function1).or(function2) 93 | * .doMore() 94 | * 95 | * 96 | * rather than 97 | * 98 | * {{{ 99 | * val temp = i.doThis() 100 | * .doThat() 101 | * val temp2 = if (condition) function(temp) else temp 102 | * temp2.doMore() 103 | * }}} 104 | * 105 | * which either needs many temporary variables or duplicate code. 106 | * 107 | * @param condition 108 | * condition 109 | * @return 110 | * the function result 111 | */ 112 | def when(condition: Boolean): ConditionalCall[T] = on(condition) 113 | 114 | /** 115 | * Executes the given function on the decorated instance. 116 | * 117 | * This allows writing fluent code like 118 | * 119 | * {{{ 120 | * i.doThis() 121 | * .doThat() 122 | * .call(function) 123 | * .doMore() 124 | * }}} 125 | * 126 | * rather than 127 | * 128 | * {{{ 129 | * function( 130 | * i.doThis() 131 | * .doThat() 132 | * ).doMore() 133 | * }}} 134 | * 135 | * where the effective sequence of operations is not clear. 136 | * 137 | * @param f 138 | * function 139 | * @return 140 | * the function result 141 | */ 142 | def call[R](f: T => R): R = f(t) 143 | } 144 | 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/spark/BuildVersion.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import java.util.Properties 20 | 21 | /** 22 | * Provides versions from build environment. 
23 | */ 24 | trait BuildVersion { 25 | val propertyFileName = "spark-extension-build.properties" 26 | 27 | lazy val props: Properties = { 28 | val properties = new Properties 29 | 30 | val in = Option(Thread.currentThread().getContextClassLoader.getResourceAsStream(propertyFileName)) 31 | if (in.isEmpty) { 32 | throw new RuntimeException(s"Property file $propertyFileName not found in class path") 33 | } 34 | 35 | in.foreach(properties.load) 36 | properties 37 | } 38 | 39 | lazy val VersionString: String = props.getProperty("project.version") 40 | 41 | lazy val BuildSparkMajorVersion: Int = props.getProperty("spark.major.version").toInt 42 | lazy val BuildSparkMinorVersion: Int = props.getProperty("spark.minor.version").toInt 43 | lazy val BuildSparkPatchVersion: Int = props.getProperty("spark.patch.version").split("-").head.toInt 44 | lazy val BuildSparkCompatVersionString: String = props.getProperty("spark.compat.version") 45 | 46 | lazy val BuildScalaMajorVersion: Int = props.getProperty("scala.major.version").toInt 47 | lazy val BuildScalaMinorVersion: Int = props.getProperty("scala.minor.version").toInt 48 | lazy val BuildScalaPatchVersion: Int = props.getProperty("scala.patch.version").toInt 49 | lazy val BuildScalaCompatVersionString: String = props.getProperty("scala.compat.version") 50 | 51 | val BuildSparkVersion: (Int, Int, Int) = (BuildSparkMajorVersion, BuildSparkMinorVersion, BuildSparkPatchVersion) 52 | val BuildSparkCompatVersion: (Int, Int) = (BuildSparkMajorVersion, BuildSparkMinorVersion) 53 | 54 | val BuildScalaVersion: (Int, Int, Int) = (BuildScalaMajorVersion, BuildScalaMinorVersion, BuildScalaPatchVersion) 55 | val BuildScalaCompatVersion: (Int, Int) = (BuildScalaMajorVersion, BuildScalaMinorVersion) 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/spark/Histogram.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.sql.functions.{sum, when} 20 | import org.apache.spark.sql.{Column, DataFrame, Dataset} 21 | import uk.co.gresearch.ExtendedAny 22 | 23 | import scala.collection.JavaConverters 24 | 25 | object Histogram { 26 | 27 | /** 28 | * Compute the histogram of a column when aggregated by aggregate columns. Thresholds are expected to be provided in 29 | * ascending order. The result dataframe contains the aggregate and histogram columns only. For each threshold value 30 | * in thresholds, there will be a column named s"≤threshold". There will also be a final column called 31 | * s">last_threshold", that counts the remaining values that exceed the last threshold. 32 | * 33 | * @param df 34 | * dataset to compute histogram from 35 | * @param thresholds 36 | * sequence of thresholds in ascending order, must implement <= and > operators w.r.t. 
valueColumn 37 | * @param valueColumn 38 | * histogram is computed for values of this column 39 | * @param aggregateColumns 40 | * histogram is computed against these columns 41 | * @tparam T 42 | * type of histogram thresholds 43 | * @return 44 | * dataframe with aggregate and histogram columns 45 | */ 46 | def of[D, T](df: Dataset[D], thresholds: Seq[T], valueColumn: Column, aggregateColumns: Column*): DataFrame = { 47 | if (thresholds.isEmpty) 48 | throw new IllegalArgumentException("Thresholds must not be empty") 49 | 50 | val bins = if (thresholds.length == 1) Seq.empty else thresholds.sliding(2).toSeq 51 | 52 | if (bins.exists(s => s.head == s.last)) 53 | throw new IllegalArgumentException(s"Thresholds must not contain duplicates: ${thresholds.mkString(",")}") 54 | 55 | df.toDF() 56 | .withColumn(s"≤${thresholds.head}", when(valueColumn <= thresholds.head, 1).otherwise(0)) 57 | .call(bins.foldLeft(_) { case (df, bin) => 58 | df.withColumn(s"≤${bin.last}", when(valueColumn > bin.head && valueColumn <= bin.last, 1).otherwise(0)) 59 | }) 60 | .withColumn(s">${thresholds.last}", when(valueColumn > thresholds.last, 1).otherwise(0)) 61 | .groupBy(aggregateColumns: _*) 62 | .agg( 63 | Some(thresholds.head).map(t => sum(backticks(s"≤$t")).as(s"≤$t")).get, 64 | thresholds.tail.map(t => sum(backticks(s"≤$t")).as(s"≤$t")) :+ 65 | sum(backticks(s">${thresholds.last}")).as(s">${thresholds.last}"): _* 66 | ) 67 | } 68 | 69 | /** 70 | * Compute the histogram of a column when aggregated by aggregate columns. Thresholds are expected to be provided in 71 | * ascending order. The result dataframe contains the aggregate and histogram columns only. For each threshold value 72 | * in thresholds, there will be a column named s"≤threshold". There will also be a final column called 73 | * s">last_threshold", that counts the remaining values that exceed the last threshold. 74 | * 75 | * @param df 76 | * dataset to compute histogram from 77 | * @param thresholds 78 | * sequence of thresholds in ascending order, must implement <= and > operators w.r.t. valueColumn 79 | * @param valueColumn 80 | * histogram is computed for values of this column 81 | * @param aggregateColumns 82 | * histogram is computed against these columns 83 | * @tparam T 84 | * type of histogram thresholds 85 | * @return 86 | * dataframe with aggregate and histogram columns 87 | */ 88 | @scala.annotation.varargs 89 | def of[D, T]( 90 | df: Dataset[D], 91 | thresholds: java.util.List[T], 92 | valueColumn: Column, 93 | aggregateColumns: Column* 94 | ): DataFrame = 95 | of(df, JavaConverters.iterableAsScalaIterable(thresholds).toSeq, valueColumn, aggregateColumns: _*) 96 | 97 | } 98 | -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/spark/RowNumbers.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.sql.expressions.Window 20 | import org.apache.spark.sql.{Column, DataFrame, Dataset, functions} 21 | import org.apache.spark.sql.functions.{coalesce, col, lit, max, monotonically_increasing_id, spark_partition_id, sum} 22 | import org.apache.spark.storage.StorageLevel 23 | 24 | case class RowNumbersFunc( 25 | rowNumberColumnName: String = "row_number", 26 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, 27 | unpersistHandle: UnpersistHandle = UnpersistHandle.Noop, 28 | orderColumns: Seq[Column] = Seq.empty 29 | ) { 30 | 31 | def withRowNumberColumnName(rowNumberColumnName: String): RowNumbersFunc = 32 | this.copy(rowNumberColumnName = rowNumberColumnName) 33 | 34 | def withStorageLevel(storageLevel: StorageLevel): RowNumbersFunc = 35 | this.copy(storageLevel = storageLevel) 36 | 37 | def withUnpersistHandle(unpersistHandle: UnpersistHandle): RowNumbersFunc = 38 | this.copy(unpersistHandle = unpersistHandle) 39 | 40 | def withOrderColumns(orderColumns: Seq[Column]): RowNumbersFunc = 41 | this.copy(orderColumns = orderColumns) 42 | 43 | def of[D](df: Dataset[D]): DataFrame = { 44 | if ( 45 | storageLevel.equals( 46 | StorageLevel.NONE 47 | ) && (SparkMajorVersion > 3 || SparkMajorVersion == 3 && SparkMinorVersion >= 5) 48 | ) { 49 | throw new IllegalArgumentException(s"Storage level $storageLevel not supported with Spark 3.5.0 and above.") 50 | } 51 | 52 | // define some column names that do not exist in ds 53 | val prefix = distinctPrefixFor(df.columns) 54 | val monoIdColumnName = prefix + "mono_id" 55 | val partitionIdColumnName = prefix + "partition_id" 56 | val localRowNumberColumnName = prefix + "local_row_number" 57 | val maxLocalRowNumberColumnName = prefix + "max_local_row_number" 58 | val cumRowNumbersColumnName = prefix + "cum_row_numbers" 59 | val partitionOffsetColumnName = prefix + "partition_offset" 60 | 61 | // if no order is given, we preserve existing order 62 | val dfOrdered = 63 | if (orderColumns.isEmpty) df.withColumn(monoIdColumnName, monotonically_increasing_id()) 64 | else df.orderBy(orderColumns: _*) 65 | val order = if (orderColumns.isEmpty) Seq(col(monoIdColumnName)) else orderColumns 66 | 67 | // add partition ids and local row numbers 68 | val localRowNumberWindow = Window.partitionBy(partitionIdColumnName).orderBy(order: _*) 69 | val dfWithPartitionId = dfOrdered 70 | .withColumn(partitionIdColumnName, spark_partition_id()) 71 | .persist(storageLevel) 72 | unpersistHandle.setDataFrame(dfWithPartitionId) 73 | val dfWithLocalRowNumbers = dfWithPartitionId 74 | .withColumn(localRowNumberColumnName, functions.row_number().over(localRowNumberWindow)) 75 | 76 | // compute row offset for the partitions 77 | val cumRowNumbersWindow = Window 78 | .orderBy(partitionIdColumnName) 79 | .rowsBetween(Window.unboundedPreceding, Window.currentRow) 80 | val partitionOffsets = dfWithLocalRowNumbers 81 | .groupBy(partitionIdColumnName) 82 | .agg(max(localRowNumberColumnName).alias(maxLocalRowNumberColumnName)) 83 | .withColumn(cumRowNumbersColumnName, sum(maxLocalRowNumberColumnName).over(cumRowNumbersWindow)) 84 | .select( 85 | col(partitionIdColumnName) + 1 as partitionIdColumnName, 86 | col(cumRowNumbersColumnName).as(partitionOffsetColumnName) 87 | ) 88 | 89 | // compute global row number by adding local row number with partition offset 90 | val partitionOffsetColumn = coalesce(col(partitionOffsetColumnName), lit(0)) 91 | dfWithLocalRowNumbers 92 | .join(partitionOffsets, 
Seq(partitionIdColumnName), "left") 93 | .withColumn(rowNumberColumnName, col(localRowNumberColumnName) + partitionOffsetColumn) 94 | .drop(monoIdColumnName, partitionIdColumnName, localRowNumberColumnName, partitionOffsetColumnName) 95 | } 96 | 97 | } 98 | 99 | object RowNumbers { 100 | def default(): RowNumbersFunc = RowNumbersFunc() 101 | 102 | def withRowNumberColumnName(rowNumberColumnName: String): RowNumbersFunc = 103 | default().withRowNumberColumnName(rowNumberColumnName) 104 | 105 | def withStorageLevel(storageLevel: StorageLevel): RowNumbersFunc = 106 | default().withStorageLevel(storageLevel) 107 | 108 | def withUnpersistHandle(unpersistHandle: UnpersistHandle): RowNumbersFunc = 109 | default().withUnpersistHandle(unpersistHandle) 110 | 111 | @scala.annotation.varargs 112 | def withOrderColumns(orderColumns: Column*): RowNumbersFunc = 113 | default().withOrderColumns(orderColumns) 114 | 115 | def of[D](ds: Dataset[D]): DataFrame = default().of(ds) 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/spark/SparkVersion.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.SPARK_VERSION_SHORT 20 | 21 | /** 22 | * Provides versions from the runtime environment. 23 | */ 24 | trait SparkVersion { 25 | private def SparkVersionSeq: Seq[Int] = SPARK_VERSION_SHORT.split('.').toSeq.map(_.toInt) 26 | 27 | def SparkMajorVersion: Int = SparkVersionSeq.head 28 | def SparkMinorVersion: Int = SparkVersionSeq(1) 29 | def SparkPatchVersion: Int = SparkVersionSeq(2) 30 | 31 | def SparkVersion: (Int, Int, Int) = (SparkMajorVersion, SparkMinorVersion, SparkPatchVersion) 32 | def SparkCompatVersion: (Int, Int) = (SparkMajorVersion, SparkMinorVersion) 33 | def SparkCompatVersionString: String = SparkVersionSeq.slice(0, 2).mkString(".") 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/spark/UnpersistHandle.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.sql.DataFrame 20 | 21 | /** 22 | * Handle to call `DataFrame.unpersist` on a `DataFrame` that is not known to the caller. The [[RowNumbers.of]] method 23 | * constructs a `DataFrame` that is based on an intermediate cached `DataFrame`, for which `unpersist` must be called. 24 | * A provided [[UnpersistHandle]] allows doing that in user code. 25 | */ 26 | class UnpersistHandle { 27 | var df: Option[DataFrame] = None 28 | 29 | private[spark] def setDataFrame(dataframe: DataFrame): DataFrame = { 30 | if (df.isDefined) throw new IllegalStateException("DataFrame has been set already, it cannot be reused.") 31 | this.df = Some(dataframe) 32 | dataframe 33 | } 34 | 35 | def apply(): Unit = { 36 | this.df.getOrElse(throw new IllegalStateException("DataFrame has to be set first")).unpersist() 37 | } 38 | 39 | def apply(blocking: Boolean): Unit = { 40 | this.df.getOrElse(throw new IllegalStateException("DataFrame has to be set first")).unpersist(blocking) 41 | } 42 | } 43 | 44 | case class SilentUnpersistHandle() extends UnpersistHandle { 45 | override def apply(): Unit = { 46 | this.df.foreach(_.unpersist()) 47 | } 48 | 49 | override def apply(blocking: Boolean): Unit = { 50 | this.df.foreach(_.unpersist(blocking)) 51 | } 52 | } 53 | 54 | case class NoopUnpersistHandle() extends UnpersistHandle { 55 | override def setDataFrame(dataframe: DataFrame): DataFrame = dataframe 56 | override def apply(): Unit = {} 57 | override def apply(blocking: Boolean): Unit = {} 58 | } 59 | 60 | object UnpersistHandle { 61 | val Noop: NoopUnpersistHandle = NoopUnpersistHandle() 62 | def apply(): UnpersistHandle = new UnpersistHandle() 63 | 64 | def withUnpersist[T](blocking: Boolean = false)(func: UnpersistHandle => T): T = { 65 | val handle = SilentUnpersistHandle() 66 | try { 67 | func(handle) 68 | } finally { 69 | handle(blocking) 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/spark/diff/DiffComparators.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.diff 18 | 19 | import org.apache.spark.sql.Encoder 20 | import org.apache.spark.sql.types.DataType 21 | import uk.co.gresearch.spark.diff.comparator._ 22 | 23 | import java.time.Duration 24 | 25 | object DiffComparators { 26 | 27 | /** 28 | * The default comparator used in [[DiffOptions.default.defaultComparator]]. 29 | */ 30 | def default(): DiffComparator = DefaultDiffComparator 31 | 32 | /** 33 | * A comparator equivalent to `Column <=> Column`. Null values are considered equal. 34 | */ 35 | def nullSafeEqual(): DiffComparator = NullSafeEqualDiffComparator 36 | 37 | /** 38 | * Return a comparator that uses the given [[math.Equiv]] to compare values of type [[T]].
The implicit [[Encoder]] of 39 | * type [[T]] determines the input data type of the comparator. Only columns of that type can be compared. 40 | */ 41 | def equiv[T: Encoder](equiv: math.Equiv[T]): EquivDiffComparator[T] = EquivDiffComparator(equiv) 42 | 43 | /** 44 | * Return a comparator that uses the given [[math.Equiv]] to compare values of type [[T]]. Only columns of the given 45 | * data type `inputType` can be compared. 46 | */ 47 | def equiv[T](equiv: math.Equiv[T], inputType: DataType): EquivDiffComparator[T] = 48 | EquivDiffComparator(equiv, inputType) 49 | 50 | /** 51 | * Return a comparator that uses the given [[math.Equiv]] to compare values of any type. 52 | */ 53 | def equiv(equiv: math.Equiv[Any]): EquivDiffComparator[Any] = EquivDiffComparator(equiv) 54 | 55 | /** 56 | * This comparator considers values equal when they are less than `epsilon` apart. It can be configured to use 57 | * `epsilon` as an absolute (`.asAbsolute()`) threshold, or as relative (`.asRelative()`) to the larger value. 58 | * Further, the threshold itself can be considered equal (`.asInclusive()`) or not equal (`.asExclusive()`): 59 | * 60 | * 65 | * 66 | * Requires compared column types to implement `-`, `*`, `<`, `==`, and `abs`. 67 | */ 68 | def epsilon(epsilon: Double): EpsilonDiffComparator = EpsilonDiffComparator(epsilon) 69 | 70 | /** 71 | * A comparator for string values. 72 | * 73 | * With `whitespaceAgnostic` set `true`, differences in white spaces are ignored. This ignores leading and trailing 74 | * whitespaces as well. With `whitespaceAgnostic` set `false`, this is equal to the default string comparison (see 75 | * [[default()]]). 76 | */ 77 | def string(whitespaceAgnostic: Boolean = true): StringDiffComparator = 78 | if (whitespaceAgnostic) { 79 | WhitespaceDiffComparator 80 | } else { 81 | StringDiffComparator 82 | } 83 | 84 | /** 85 | * This comparator considers two `DateType` or `TimestampType` values equal when they are at most `duration` apart. 86 | * Duration is an instance of `java.time.Duration`. 87 | * 88 | * The comparator can be configured to consider `duration` as equal (`.asInclusive()`) or not equal 89 | * (`.asExclusive()`):