├── .github ├── actions │ ├── build-whl │ │ └── action.yml │ ├── build │ │ └── action.yml │ ├── check-compat │ │ └── action.yml │ ├── prime-caches │ │ └── action.yml │ ├── test-jvm │ │ └── action.yml │ └── test-python │ │ └── action.yml ├── dependabot.yml └── workflows │ ├── build-jvm.yml │ ├── build-python.yml │ ├── build-snapshots.yml │ ├── check.yml │ ├── ci.yml │ ├── clear-caches.yaml │ ├── prepare-release.yml │ ├── prime-caches.yml │ ├── publish-release.yml │ ├── publish-snapshot.yml │ ├── test-jvm.yml │ ├── test-python.yml │ ├── test-results.yml │ └── test-snapshots.yml ├── .gitignore ├── .scalafmt.conf ├── CHANGELOG.md ├── CONDITIONAL.md ├── DIFF.md ├── GROUPS.md ├── HISTOGRAM.md ├── LICENSE ├── MAINTAINERS.md ├── PARQUET.md ├── PARTITIONING.md ├── PYSPARK-DEPS.md ├── README.md ├── RELEASE.md ├── ROW_NUMBER.md ├── SECURITY.md ├── build-whl.sh ├── bump-version.sh ├── examples └── python-deps │ ├── Dockerfile │ ├── docker-compose.yml │ └── example.py ├── pom.xml ├── python ├── README.md ├── gresearch │ ├── __init__.py │ └── spark │ │ ├── __init__.py │ │ ├── diff │ │ ├── __init__.py │ │ └── comparator │ │ │ └── __init__.py │ │ └── parquet │ │ └── __init__.py ├── pyproject.toml ├── requirements-3.0_2.12.txt ├── requirements-3.1_2.12.txt ├── requirements-3.2_2.12.txt ├── requirements-3.2_2.13.txt ├── requirements-3.3_2.12.txt ├── requirements-3.3_2.13.txt ├── requirements-3.4_2.12.txt ├── requirements-3.4_2.13.txt ├── requirements-3.5_2.12.txt ├── requirements-3.5_2.13.txt ├── requirements-4.0_2.13.txt ├── requirements-4.1_2.13.txt ├── setup.py └── test │ ├── __init__.py │ ├── requirements.txt │ ├── spark_common.py │ ├── test_diff.py │ ├── test_histogram.py │ ├── test_job_description.py │ ├── test_jvm.py │ ├── test_package.py │ ├── test_parquet.py │ └── test_row_number.py ├── release.sh ├── set-version.sh ├── src ├── main │ ├── scala-spark-3.0 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── package.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-3.1 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── package.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-3.2 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── package.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-3.3 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── package.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── 
FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-3.4 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── package.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-3.5 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── package.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-4.0 │ │ ├── org │ │ │ └── apache │ │ │ │ └── spark │ │ │ │ └── sql │ │ │ │ └── extension │ │ │ │ └── extension.scala │ │ └── uk │ │ │ └── co │ │ │ └── gresearch │ │ │ └── spark │ │ │ ├── Backticks.scala │ │ │ ├── BinaryLikeWithNewChildrenInternal.scala │ │ │ ├── UnixMicros.scala │ │ │ └── parquet │ │ │ ├── BlockMetaDataUtil.scala │ │ │ ├── FileMetaDataUtil.scala │ │ │ ├── PrimitiveTypeUtil.scala │ │ │ └── SplitFile.scala │ ├── scala-spark-4.1 │ └── scala │ │ └── uk │ │ └── co │ │ └── gresearch │ │ ├── package.scala │ │ └── spark │ │ ├── BuildVersion.scala │ │ ├── Histogram.scala │ │ ├── RowNumbers.scala │ │ ├── SparkVersion.scala │ │ ├── UnpersistHandle.scala │ │ ├── diff │ │ ├── App.scala │ │ ├── Diff.scala │ │ ├── DiffComparators.scala │ │ ├── DiffOptions.scala │ │ ├── comparator │ │ │ ├── DefaultDiffComparator.scala │ │ │ ├── DiffComparator.scala │ │ │ ├── DurationDiffComparator.scala │ │ │ ├── EpsilonDiffComparator.scala │ │ │ ├── EquivDiffComparator.scala │ │ │ ├── MapDiffComparator.scala │ │ │ ├── NullSafeEqualDiffComparator.scala │ │ │ ├── TypedDiffComparator.scala │ │ │ └── WhitespaceDiffComparator.scala │ │ └── package.scala │ │ ├── group │ │ └── package.scala │ │ ├── package.scala │ │ └── parquet │ │ └── package.scala └── test │ ├── files │ ├── nested.parquet │ └── test.parquet │ │ ├── file1.parquet │ │ └── file2.parquet │ ├── java │ └── uk │ │ └── co │ │ └── gresearch │ │ └── test │ │ ├── SparkJavaTests.java │ │ └── diff │ │ ├── DiffJavaTests.java │ │ ├── JavaValue.java │ │ └── JavaValueAs.java │ ├── resources │ ├── log4j.properties │ └── log4j2.properties │ ├── scala-spark-3 │ └── uk │ │ └── co │ │ └── gresearch │ │ └── spark │ │ └── SparkSuiteHelper.scala │ ├── scala-spark-4 │ └── uk │ │ └── co │ │ └── gresearch │ │ └── spark │ │ └── SparkSuiteHelper.scala │ └── scala │ └── uk │ └── co │ └── gresearch │ └── spark │ ├── GroupBySuite.scala │ ├── HistogramSuite.scala │ ├── SparkSuite.scala │ ├── SparkTestSession.scala │ ├── WritePartitionedSuite.scala │ ├── diff │ ├── AppSuite.scala │ ├── DiffComparatorSuite.scala │ ├── DiffOptionsSuite.scala │ ├── DiffSuite.scala │ └── examples │ │ └── Examples.scala │ ├── group │ └── GroupSuite.scala │ ├── parquet │ └── ParquetSuite.scala │ └── test │ └── package.scala ├── test-release.py ├── test-release.scala ├── test-release.sh ├── with-job-description.png └── without-job-description.png /.github/actions/build-whl/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Build Whl' 2 | author: 'EnricoMi' 3 | description: 'A GitHub 
Action that builds the pyspark-extension package' 4 | 5 | inputs: 6 | spark-version: 7 | description: Spark version, e.g. 3.4.0, 3.4.0-SNAPSHOT, or 4.0.0-preview1 8 | required: true 9 | scala-version: 10 | description: Scala version, e.g. 2.12.15 11 | required: true 12 | spark-compat-version: 13 | description: Spark compatibility version, e.g. 3.4 14 | required: true 15 | scala-compat-version: 16 | description: Scala compatibility version, e.g. 2.12 17 | required: true 18 | java-compat-version: 19 | description: Java compatibility version, e.g. 8 20 | required: true 21 | python-version: 22 | description: Python version, e.g. 3.8 23 | required: true 24 | 25 | runs: 26 | using: 'composite' 27 | steps: 28 | - name: Fetch Binaries Artifact 29 | uses: actions/download-artifact@v4 30 | with: 31 | name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }} 32 | path: . 33 | 34 | - name: Set versions in pom.xml 35 | run: | 36 | ./set-version.sh ${{ inputs.spark-version }} ${{ inputs.scala-version }} 37 | git diff 38 | shell: bash 39 | 40 | - name: Restore Maven packages cache 41 | if: github.event_name != 'schedule' 42 | uses: actions/cache/restore@v4 43 | with: 44 | path: ~/.m2/repository 45 | key: ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 46 | restore-keys: 47 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 48 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}- 49 | 50 | - name: Setup JDK ${{ inputs.java-compat-version }} 51 | uses: actions/setup-java@v4 52 | with: 53 | java-version: ${{ inputs.java-compat-version }} 54 | distribution: 'zulu' 55 | 56 | - name: Setup Python 57 | uses: actions/setup-python@v5 58 | with: 59 | python-version: ${{ inputs.python-version }} 60 | 61 | - name: Install Python dependencies 62 | run: | 63 | # Install Python dependencies 64 | echo "::group::pip install" 65 | python -m pip install --upgrade pip build twine 66 | echo "::endgroup::" 67 | shell: bash 68 | 69 | - name: Build whl 70 | run: | 71 | # Build whl 72 | echo "::group::build-whl.sh" 73 | ./build-whl.sh 74 | echo "::endgroup::" 75 | shell: bash 76 | 77 | - name: Test whl 78 | run: | 79 | # Test whl 80 | echo "::group::test-release.py" 81 | twine check python/dist/* 82 | pip install -r python/requirements-${{ inputs.spark-compat-version }}_${{ inputs.scala-compat-version }}.txt 83 | pip install python/dist/*.whl 84 | python test-release.py 85 | echo "::endgroup::" 86 | shell: bash 87 | 88 | - name: Upload whl 89 | uses: actions/upload-artifact@v4 90 | with: 91 | name: Whl (Spark ${{ inputs.spark-compat-version }} Scala ${{ inputs.scala-compat-version }}) 92 | path: | 93 | python/dist/*.whl 94 | 95 | branding: 96 | icon: 'check-circle' 97 | color: 'green' 98 | -------------------------------------------------------------------------------- /.github/actions/build/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Build' 2 | author: 'EnricoMi' 3 | description: 'A GitHub Action that builds spark-extension' 4 | 5 | inputs: 6 | spark-version: 7 | description: Spark version, e.g. 3.4.0 or 3.4.0-SNAPSHOT 8 | required: true 9 | scala-version: 10 | description: Scala version, e.g. 2.12.15 11 | required: true 12 | spark-compat-version: 13 | description: Spark compatibility version, e.g. 
3.4 14 | required: true 15 | scala-compat-version: 16 | description: Scala compatibility version, e.g. 2.12 17 | required: true 18 | java-compat-version: 19 | description: Java compatibility version, e.g. 8 20 | required: true 21 | hadoop-version: 22 | description: Hadoop version, e.g. 2.7 or 2 23 | required: true 24 | 25 | runs: 26 | using: 'composite' 27 | steps: 28 | - name: Set versions in pom.xml 29 | run: | 30 | ./set-version.sh ${{ inputs.spark-version }} ${{ inputs.scala-version }} 31 | git diff 32 | shell: bash 33 | 34 | - name: Restore Maven packages cache 35 | if: github.event_name != 'schedule' 36 | uses: actions/cache/restore@v4 37 | with: 38 | path: ~/.m2/repository 39 | key: ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 40 | restore-keys: 41 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 42 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}- 43 | 44 | - name: Setup JDK ${{ inputs.java-compat-version }} 45 | uses: actions/setup-java@v4 46 | with: 47 | java-version: ${{ inputs.java-compat-version }} 48 | distribution: 'zulu' 49 | 50 | - name: Build 51 | env: 52 | JDK_JAVA_OPTIONS: --add-exports java.base/sun.nio.ch=ALL-UNNAMED --add-exports java.base/sun.util.calendar=ALL-UNNAMED 53 | run: | 54 | # Build 55 | echo "::group::mvn compile" 56 | mvn --batch-mode --update-snapshots -Dspotless.check.skip clean compile test-compile 57 | echo "::endgroup::" 58 | 59 | echo "::group::mvn package" 60 | mvn --batch-mode package -Dspotless.check.skip -DskipTests -Dmaven.test.skip=true 61 | echo "::endgroup::" 62 | 63 | echo "::group::mvn install" 64 | mvn --batch-mode install -Dspotless.check.skip -DskipTests -Dmaven.test.skip=true -Dgpg.skip 65 | echo "::endgroup::" 66 | shell: bash 67 | 68 | - name: Upload Binaries 69 | uses: actions/upload-artifact@v4 70 | with: 71 | name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }} 72 | path: | 73 | * 74 | !.* 75 | !target/*-javadoc.jar 76 | !target/*-sources.jar 77 | !target/site 78 | 79 | branding: 80 | icon: 'check-circle' 81 | color: 'green' 82 | -------------------------------------------------------------------------------- /.github/actions/check-compat/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Check' 2 | author: 'EnricoMi' 3 | description: 'A GitHub Action that checks compatibility of spark-extension' 4 | 5 | inputs: 6 | spark-version: 7 | description: Spark version, e.g. 3.4.0 or 3.4.0-SNAPSHOT 8 | required: true 9 | scala-version: 10 | description: Scala version, e.g. 2.12.15 11 | required: true 12 | spark-compat-version: 13 | description: Spark compatibility version, e.g. 3.4 14 | required: true 15 | scala-compat-version: 16 | description: Scala compatibility version, e.g. 2.12 17 | required: true 18 | package-version: 19 | description: Spark-Extension version to check against 20 | required: true 21 | 22 | runs: 23 | using: 'composite' 24 | steps: 25 | - name: Fetch Binaries Artifact 26 | uses: actions/download-artifact@v4 27 | with: 28 | name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }} 29 | path: . 
30 | 31 | - name: Set versions in pom.xml 32 | run: | 33 | ./set-version.sh ${{ inputs.spark-version }} ${{ inputs.scala-version }} 34 | git diff 35 | shell: bash 36 | 37 | - name: Restore Maven packages cache 38 | if: github.event_name != 'schedule' 39 | uses: actions/cache/restore@v4 40 | with: 41 | path: ~/.m2/repository 42 | key: ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 43 | restore-keys: 44 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 45 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}- 46 | 47 | - name: Setup JDK 1.8 48 | uses: actions/setup-java@v4 49 | with: 50 | java-version: '8' 51 | distribution: 'zulu' 52 | 53 | - name: Install Checker 54 | run: | 55 | # Install Checker 56 | echo "::group::apt update install" 57 | sudo apt update 58 | sudo apt install japi-compliance-checker 59 | echo "::endgroup::" 60 | shell: bash 61 | 62 | - name: Release exists 63 | id: exists 64 | continue-on-error: true 65 | run: | 66 | # Release exists 67 | curl --head --fail https://repo1.maven.org/maven2/uk/co/gresearch/spark/spark-extension_${{ inputs.scala-compat-version }}/${{ inputs.package-version }}-${{ inputs.spark-compat-version }}/spark-extension_${{ inputs.scala-compat-version }}-${{ inputs.package-version }}-${{ inputs.spark-compat-version }}.jar 68 | shell: bash 69 | 70 | - name: Fetch package 71 | if: steps.exists.outcome == 'success' 72 | run: | 73 | # Fetch package 74 | echo "::group::mvn dependency:get" 75 | mvn dependency:get -Dtransitive=false -DremoteRepositories -Dartifact=uk.co.gresearch.spark:spark-extension_${{ inputs.scala-compat-version }}:${{ inputs.package-version }}-${{ inputs.spark-compat-version }} 76 | echo "::endgroup::" 77 | shell: bash 78 | 79 | - name: Check 80 | if: steps.exists.outcome == 'success' 81 | continue-on-error: ${{ github.ref == 'refs/heads/master' }} 82 | run: | 83 | # Check 84 | echo "::group::japi-compliance-checker" 85 | ls -lah ~/.m2/repository/uk/co/gresearch/spark/spark-extension_${{ inputs.scala-compat-version }}/${{ inputs.package-version }}-${{ inputs.spark-compat-version }}/spark-extension_${{ inputs.scala-compat-version }}-${{ inputs.package-version }}-${{ inputs.spark-compat-version }}.jar target/spark-extension*.jar 86 | japi-compliance-checker ~/.m2/repository/uk/co/gresearch/spark/spark-extension_${{ inputs.scala-compat-version }}/${{ inputs.package-version }}-${{ inputs.spark-compat-version }}/spark-extension_${{ inputs.scala-compat-version }}-${{ inputs.package-version }}-${{ inputs.spark-compat-version }}.jar target/spark-extension*.jar 87 | echo "::endgroup::" 88 | shell: bash 89 | 90 | - name: Upload Report 91 | uses: actions/upload-artifact@v4 92 | if: always() && steps.exists.outcome == 'success' 93 | with: 94 | name: Compat-Report-${{ inputs.spark-compat-version }} 95 | path: compat_reports/spark-extension/* 96 | 97 | branding: 98 | icon: 'check-circle' 99 | color: 'green' 100 | -------------------------------------------------------------------------------- /.github/actions/prime-caches/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Prime caches' 2 | author: 'EnricoMi' 3 | description: 'A GitHub Action that primes caches' 4 | 5 | inputs: 6 | spark-version: 7 | description: Spark version, e.g. 3.4.0 or 3.4.0-SNAPSHOT 8 | required: true 9 | scala-version: 10 | description: Scala version, e.g. 
2.12.15 11 | required: true 12 | spark-compat-version: 13 | description: Spark compatibility version, e.g. 3.4 14 | required: true 15 | scala-compat-version: 16 | description: Scala compatibility version, e.g. 2.12 17 | required: true 18 | java-compat-version: 19 | description: Java compatibility version, e.g. 8 20 | required: true 21 | hadoop-version: 22 | description: Hadoop version, e.g. 2.7 or 2 23 | required: true 24 | 25 | runs: 26 | using: 'composite' 27 | steps: 28 | - name: Set versions in pom.xml 29 | run: | 30 | ./set-version.sh ${{ inputs.spark-version }} ${{ inputs.scala-version }} 31 | git diff 32 | shell: bash 33 | 34 | - name: Setup JDK ${{ inputs.java-compat-version }} 35 | uses: actions/setup-java@v4 36 | with: 37 | java-version: ${{ inputs.java-compat-version }} 38 | distribution: 'zulu' 39 | 40 | - name: Build 41 | env: 42 | JDK_JAVA_OPTIONS: --add-exports java.base/sun.nio.ch=ALL-UNNAMED --add-exports java.base/sun.util.calendar=ALL-UNNAMED 43 | run: | 44 | # Build 45 | echo "::group::mvn dependency:go-offline" 46 | mvn --batch-mode dependency:go-offline 47 | echo "::endgroup::" 48 | shell: bash 49 | 50 | - name: Save Maven packages cache 51 | uses: actions/cache/save@v4 52 | with: 53 | path: ~/.m2/repository 54 | key: ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }}-${{ github.run_id }} 55 | 56 | - name: Setup Spark Binaries 57 | if: ( ! contains(inputs.spark-version, '-SNAPSHOT') ) 58 | env: 59 | SPARK_PACKAGE: spark-${{ inputs.spark-version }}/spark-${{ inputs.spark-version }}-bin-hadoop${{ inputs.hadoop-version }}${{ inputs.scala-compat-version == '2.13' && '-scala2.13' || '' }}.tgz 60 | run: | 61 | wget --progress=dot:giga "https://www.apache.org/dyn/closer.lua/spark/${SPARK_PACKAGE}?action=download" -O - | tar -xzC "${{ runner.temp }}" 62 | archive=$(basename "${SPARK_PACKAGE}") bash -c "mv -v "${{ runner.temp }}/\${archive/%.tgz/}" ~/spark" 63 | shell: bash 64 | 65 | - name: Save Spark Binaries cache 66 | if: ( ! contains(inputs.spark-version, '-SNAPSHOT') ) 67 | uses: actions/cache/save@v4 68 | with: 69 | path: ~/spark 70 | key: ${{ runner.os }}-spark-binaries-${{ inputs.spark-version }}-${{ inputs.scala-compat-version }}-${{ github.run_id }} 71 | 72 | branding: 73 | icon: 'check-circle' 74 | color: 'green' 75 | -------------------------------------------------------------------------------- /.github/actions/test-jvm/action.yml: -------------------------------------------------------------------------------- 1 | name: 'Test JVM' 2 | author: 'EnricoMi' 3 | description: 'A GitHub Action that tests JVM spark-extension' 4 | 5 | inputs: 6 | spark-version: 7 | description: Spark version, e.g. 3.4.0 or 3.4.0-SNAPSHOT 8 | required: true 9 | scala-version: 10 | description: Scala version, e.g. 2.12.15 11 | required: true 12 | spark-compat-version: 13 | description: Spark compatibility version, e.g. 3.4 14 | required: true 15 | scala-compat-version: 16 | description: Scala compatibility version, e.g. 2.12 17 | required: true 18 | hadoop-version: 19 | description: Hadoop version, e.g. 2.7 or 2 20 | required: true 21 | java-compat-version: 22 | description: Java compatibility version, e.g. 8 23 | required: true 24 | 25 | runs: 26 | using: 'composite' 27 | steps: 28 | - name: Fetch Binaries Artifact 29 | uses: actions/download-artifact@v4 30 | with: 31 | name: Binaries-${{ inputs.spark-compat-version }}-${{ inputs.scala-compat-version }} 32 | path: . 
33 | 34 | - name: Set versions in pom.xml 35 | run: | 36 | ./set-version.sh ${{ inputs.spark-version }} ${{ inputs.scala-version }} 37 | git diff 38 | shell: bash 39 | 40 | - name: Restore Spark Binaries cache 41 | if: github.event_name != 'schedule' && ! contains(inputs.spark-version, '-SNAPSHOT') 42 | uses: actions/cache/restore@v4 43 | with: 44 | path: ~/spark 45 | key: ${{ runner.os }}-spark-binaries-${{ inputs.spark-version }}-${{ inputs.scala-compat-version }} 46 | restore-keys: 47 | ${{ runner.os }}-spark-binaries-${{ inputs.spark-version }}-${{ inputs.scala-compat-version }} 48 | 49 | - name: Setup Spark Binaries 50 | if: ( ! contains(inputs.spark-version, '-SNAPSHOT') ) 51 | env: 52 | SPARK_PACKAGE: spark-${{ inputs.spark-version }}/spark-${{ inputs.spark-version }}-bin-hadoop${{ inputs.hadoop-version }}${{ inputs.scala-compat-version == '2.13' && '-scala2.13' || '' }}.tgz 53 | run: | 54 | # Setup Spark Binaries 55 | if [[ ! -e ~/spark ]] 56 | then 57 | wget --progress=dot:giga "https://www.apache.org/dyn/closer.lua/spark/${SPARK_PACKAGE}?action=download" -O - | tar -xzC "${{ runner.temp }}" 58 | archive=$(basename "${SPARK_PACKAGE}") bash -c "mv -v "${{ runner.temp }}/\${archive/%.tgz/}" ~/spark" 59 | fi 60 | echo "SPARK_HOME=$(cd ~/spark; pwd)" >> $GITHUB_ENV 61 | shell: bash 62 | 63 | - name: Restore Maven packages cache 64 | if: github.event_name != 'schedule' 65 | uses: actions/cache/restore@v4 66 | with: 67 | path: ~/.m2/repository 68 | key: ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 69 | restore-keys: 70 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}-${{ hashFiles('pom.xml') }} 71 | ${{ runner.os }}-mvn-build-${{ inputs.spark-version }}-${{ inputs.scala-version }}- 72 | 73 | - name: Setup JDK ${{ inputs.java-compat-version }} 74 | uses: actions/setup-java@v4 75 | with: 76 | java-version: ${{ inputs.java-compat-version }} 77 | distribution: 'zulu' 78 | 79 | - name: Scala and Java Tests 80 | env: 81 | JDK_JAVA_OPTIONS: --add-exports java.base/sun.nio.ch=ALL-UNNAMED --add-exports java.base/sun.util.calendar=ALL-UNNAMED 82 | run: | 83 | # Scala and Java Tests 84 | echo "::group::mvn test" 85 | mvn --batch-mode --update-snapshots -Dspotless.check.skip test 86 | echo "::endgroup::" 87 | shell: bash 88 | 89 | - name: Diff App test 90 | if: ( ! 
contains(inputs.spark-version, '-SNAPSHOT') ) 91 | run: | 92 | # Diff App test 93 | echo "::group::spark-submit" 94 | $SPARK_HOME/bin/spark-submit --packages com.github.scopt:scopt_${{ inputs.scala-compat-version }}:4.1.0 target/spark-extension_*.jar --format parquet --id id src/test/files/test.parquet/file1.parquet src/test/files/test.parquet/file2.parquet diff.parquet 95 | echo "::endgroup::" 96 | 97 | echo "::group::spark-shell" 98 | $SPARK_HOME/bin/spark-shell <<< 'val df = spark.read.parquet("diff.parquet").orderBy($"id").groupBy($"diff").count; df.show; if (df.count != 2) sys.exit(1)' 99 | echo "::endgroup::" 100 | shell: bash 101 | 102 | - name: Generate Unit Test Report 103 | if: failure() 104 | run: | 105 | # Generate Unit Test Report 106 | echo "::group::mvn report-only" 107 | mvn --batch-mode surefire-report:report-only 108 | echo "::endgroup::" 109 | shell: bash 110 | 111 | - name: Upload Unit Test Results 112 | if: always() 113 | uses: actions/upload-artifact@v4 114 | with: 115 | name: JVM Test Results (Spark ${{ inputs.spark-version }} Scala ${{ inputs.scala-version }}) 116 | path: | 117 | target/surefire-reports/*.xml 118 | !target/surefire-reports/TEST-org.scalatest*.xml 119 | target/site/surefire-report.html 120 | 121 | branding: 122 | icon: 'check-circle' 123 | color: 'green' 124 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | updates: 3 | - package-ecosystem: "github-actions" 4 | directory: "/" 5 | schedule: 6 | interval: "monthly" 7 | 8 | - package-ecosystem: "maven" 9 | directory: "/" 10 | schedule: 11 | interval: "daily" 12 | -------------------------------------------------------------------------------- /.github/workflows/build-jvm.yml: -------------------------------------------------------------------------------- 1 | name: Build JVM 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | build: 8 | name: Build (Spark ${{ matrix.spark-version }} Scala ${{ matrix.scala-version }}) 9 | runs-on: ubuntu-latest 10 | 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | include: 15 | - spark-compat-version: '3.0' 16 | spark-version: '3.0.3' 17 | scala-compat-version: '2.12' 18 | scala-version: '2.12.10' 19 | hadoop-version: '2.7' 20 | - spark-compat-version: '3.1' 21 | spark-version: '3.1.3' 22 | scala-compat-version: '2.12' 23 | scala-version: '2.12.10' 24 | hadoop-version: '2.7' 25 | - spark-compat-version: '3.2' 26 | spark-version: '3.2.4' 27 | scala-compat-version: '2.12' 28 | scala-version: '2.12.15' 29 | hadoop-version: '2.7' 30 | - spark-compat-version: '3.3' 31 | spark-version: '3.3.4' 32 | scala-compat-version: '2.12' 33 | scala-version: '2.12.15' 34 | hadoop-version: '3' 35 | - spark-compat-version: '3.4' 36 | scala-compat-version: '2.12' 37 | scala-version: '2.12.17' 38 | spark-version: '3.4.4' 39 | hadoop-version: '3' 40 | - spark-compat-version: '3.5' 41 | scala-compat-version: '2.12' 42 | scala-version: '2.12.18' 43 | spark-version: '3.5.5' 44 | hadoop-version: '3' 45 | 46 | - spark-compat-version: '3.2' 47 | spark-version: '3.2.4' 48 | scala-compat-version: '2.13' 49 | scala-version: '2.13.5' 50 | hadoop-version: '3.2' 51 | - spark-compat-version: '3.3' 52 | spark-version: '3.3.4' 53 | scala-compat-version: '2.13' 54 | scala-version: '2.13.8' 55 | hadoop-version: '3' 56 | - spark-compat-version: '3.4' 57 | scala-compat-version: '2.13' 58 | scala-version: '2.13.8' 59 | spark-version: '3.4.4' 60 | 
hadoop-version: '3' 61 | - spark-compat-version: '3.5' 62 | scala-compat-version: '2.13' 63 | scala-version: '2.13.8' 64 | spark-version: '3.5.5' 65 | hadoop-version: '3' 66 | 67 | steps: 68 | - name: Checkout 69 | uses: actions/checkout@v4 70 | 71 | - name: Build 72 | uses: ./.github/actions/build 73 | with: 74 | spark-version: ${{ matrix.spark-version }} 75 | scala-version: ${{ matrix.scala-version }} 76 | spark-compat-version: ${{ matrix.spark-compat-version }} 77 | scala-compat-version: ${{ matrix.scala-compat-version }} 78 | hadoop-version: ${{ matrix.hadoop-version }} 79 | java-compat-version: '8' 80 | -------------------------------------------------------------------------------- /.github/workflows/build-python.yml: -------------------------------------------------------------------------------- 1 | name: Build Python 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | # pyspark<4 is not available for snapshots or scala other than 2.12 8 | whl: 9 | name: Build whl (Spark ${{ matrix.spark-version }} Scala ${{ matrix.scala-version }}) 10 | runs-on: ubuntu-latest 11 | 12 | strategy: 13 | fail-fast: false 14 | matrix: 15 | include: 16 | - spark-compat-version: '3.0' 17 | spark-version: '3.0.3' 18 | scala-compat-version: '2.12' 19 | scala-version: '2.12.10' 20 | java-compat-version: '8' 21 | - spark-compat-version: '3.1' 22 | spark-version: '3.1.3' 23 | scala-compat-version: '2.12' 24 | scala-version: '2.12.10' 25 | java-compat-version: '8' 26 | - spark-compat-version: '3.2' 27 | spark-version: '3.2.4' 28 | scala-compat-version: '2.12' 29 | scala-version: '2.12.15' 30 | java-compat-version: '8' 31 | - spark-compat-version: '3.3' 32 | spark-version: '3.3.4' 33 | scala-compat-version: '2.12' 34 | scala-version: '2.12.15' 35 | java-compat-version: '8' 36 | - spark-compat-version: '3.4' 37 | spark-version: '3.4.4' 38 | scala-compat-version: '2.12' 39 | scala-version: '2.12.17' 40 | java-compat-version: '8' 41 | - spark-compat-version: '3.5' 42 | spark-version: '3.5.5' 43 | scala-compat-version: '2.12' 44 | scala-version: '2.12.18' 45 | java-compat-version: '8' 46 | 47 | steps: 48 | - name: Checkout 49 | uses: actions/checkout@v4 50 | 51 | - name: Build 52 | uses: ./.github/actions/build-whl 53 | with: 54 | spark-version: ${{ matrix.spark-version }} 55 | scala-version: ${{ matrix.scala-version }} 56 | spark-compat-version: ${{ matrix.spark-compat-version }} 57 | scala-compat-version: ${{ matrix.scala-compat-version }} 58 | java-compat-version: ${{ matrix.java-compat-version }} 59 | python-version: "3.9" 60 | -------------------------------------------------------------------------------- /.github/workflows/build-snapshots.yml: -------------------------------------------------------------------------------- 1 | name: Build Snapshots 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | build: 8 | name: Build (Spark ${{ matrix.spark-version }} Scala ${{ matrix.scala-version }}) 9 | runs-on: ubuntu-latest 10 | 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | include: 15 | - spark-compat-version: '3.2' 16 | spark-version: '3.2.5-SNAPSHOT' 17 | scala-compat-version: '2.12' 18 | scala-version: '2.12.15' 19 | java-compat-version: '8' 20 | - spark-compat-version: '3.3' 21 | spark-version: '3.3.5-SNAPSHOT' 22 | scala-compat-version: '2.12' 23 | scala-version: '2.12.15' 24 | java-compat-version: '8' 25 | - spark-compat-version: '3.4' 26 | spark-version: '3.4.5-SNAPSHOT' 27 | scala-compat-version: '2.12' 28 | scala-version: '2.12.17' 29 | java-compat-version: '8' 30 | - spark-compat-version: '3.5' 31 | 
spark-version: '3.5.6-SNAPSHOT' 32 | scala-compat-version: '2.12' 33 | scala-version: '2.12.18' 34 | java-compat-version: '8' 35 | 36 | - spark-compat-version: '3.2' 37 | spark-version: '3.2.5-SNAPSHOT' 38 | scala-compat-version: '2.13' 39 | scala-version: '2.13.5' 40 | java-compat-version: '8' 41 | - spark-compat-version: '3.3' 42 | spark-version: '3.3.5-SNAPSHOT' 43 | scala-compat-version: '2.13' 44 | scala-version: '2.13.8' 45 | java-compat-version: '8' 46 | - spark-compat-version: '3.4' 47 | spark-version: '3.4.5-SNAPSHOT' 48 | scala-compat-version: '2.13' 49 | scala-version: '2.13.8' 50 | java-compat-version: '8' 51 | - spark-compat-version: '3.5' 52 | spark-version: '3.5.6-SNAPSHOT' 53 | scala-compat-version: '2.13' 54 | scala-version: '2.13.8' 55 | java-compat-version: '8' 56 | - spark-compat-version: '4.0' 57 | spark-version: '4.0.1-SNAPSHOT' 58 | scala-compat-version: '2.13' 59 | scala-version: '2.13.16' 60 | java-compat-version: '17' 61 | - spark-compat-version: '4.1' 62 | spark-version: '4.1.0-SNAPSHOT' 63 | scala-compat-version: '2.13' 64 | scala-version: '2.13.16' 65 | java-compat-version: '17' 66 | 67 | steps: 68 | - name: Checkout 69 | uses: actions/checkout@v4 70 | 71 | - name: Build 72 | uses: ./.github/actions/build 73 | with: 74 | spark-version: ${{ matrix.spark-version }} 75 | scala-version: ${{ matrix.scala-version }} 76 | spark-compat-version: ${{ matrix.spark-compat-version }}-SNAPSHOT 77 | scala-compat-version: ${{ matrix.scala-compat-version }} 78 | java-compat-version: ${{ matrix.java-compat-version }} 79 | -------------------------------------------------------------------------------- /.github/workflows/check.yml: -------------------------------------------------------------------------------- 1 | name: Check 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | lint: 8 | name: Scala lint 9 | runs-on: ubuntu-latest 10 | 11 | steps: 12 | - name: Checkout 13 | uses: actions/checkout@v4 14 | with: 15 | fetch-depth: 0 16 | 17 | - name: Setup JDK ${{ inputs.java-compat-version }} 18 | uses: actions/setup-java@v4 19 | with: 20 | java-version: '11' 21 | distribution: 'zulu' 22 | 23 | - name: Check 24 | id: check 25 | run: | 26 | mvn --batch-mode --update-snapshots spotless:check 27 | shell: bash 28 | 29 | - name: Changes 30 | if: failure() && steps.check.outcome == 'failure' 31 | run: | 32 | mvn --batch-mode --update-snapshots spotless:apply 33 | git diff 34 | shell: bash 35 | 36 | config: 37 | name: Configure compat 38 | runs-on: ubuntu-latest 39 | outputs: 40 | major-version: ${{ steps.versions.outputs.major-version }} 41 | release-version: ${{ steps.versions.outputs.release-version }} 42 | release-major-version: ${{ steps.versions.outputs.release-major-version }} 43 | 44 | steps: 45 | - name: Checkout 46 | uses: actions/checkout@v4 47 | with: 48 | fetch-depth: 0 49 | 50 | - name: Get versions 51 | id: versions 52 | run: | 53 | version=$(grep -m1 version pom.xml | sed -e "s/<[^>]*>//g" -e "s/ //g") 54 | echo "version: $version" 55 | echo "major-version: ${version/.*/}" 56 | echo "version=$version" >> "$GITHUB_OUTPUT" 57 | echo "major-version=${version/.*/}" >> "$GITHUB_OUTPUT" 58 | release_version=$(git tag | grep "^v" | sort --version-sort | tail -n1 | sed "s/^v//") 59 | echo "release-version: $release_version" 60 | echo "release-major-version: ${release_version/.*/}" 61 | echo "release-version=$release_version" >> "$GITHUB_OUTPUT" 62 | echo "release-major-version=${release_version/.*/}" >> "$GITHUB_OUTPUT" 63 | shell: bash 64 | 65 | compat: 66 | name: Compat (Spark ${{ 
matrix.spark-compat-version }} Scala ${{ matrix.scala-compat-version }}) 67 | needs: config 68 | runs-on: ubuntu-latest 69 | if: needs.config.outputs.major-version == needs.config.outputs.release-major-version 70 | 71 | strategy: 72 | fail-fast: false 73 | matrix: 74 | include: 75 | - spark-compat-version: '3.0' 76 | spark-version: '3.0.3' 77 | scala-compat-version: '2.12' 78 | scala-version: '2.12.10' 79 | - spark-compat-version: '3.1' 80 | spark-version: '3.1.3' 81 | scala-compat-version: '2.12' 82 | scala-version: '2.12.10' 83 | - spark-compat-version: '3.2' 84 | spark-version: '3.2.4' 85 | scala-compat-version: '2.12' 86 | scala-version: '2.12.15' 87 | - spark-compat-version: '3.3' 88 | spark-version: '3.3.3' 89 | scala-compat-version: '2.12' 90 | scala-version: '2.12.15' 91 | - spark-compat-version: '3.4' 92 | scala-compat-version: '2.12' 93 | scala-version: '2.12.17' 94 | spark-version: '3.4.2' 95 | - spark-compat-version: '3.5' 96 | scala-compat-version: '2.12' 97 | scala-version: '2.12.18' 98 | spark-version: '3.5.0' 99 | 100 | steps: 101 | - name: Checkout 102 | uses: actions/checkout@v4 103 | 104 | - name: Check 105 | uses: ./.github/actions/check-compat 106 | with: 107 | spark-version: ${{ matrix.spark-version }} 108 | scala-version: ${{ matrix.scala-version }} 109 | spark-compat-version: ${{ matrix.spark-compat-version }} 110 | scala-compat-version: ${{ matrix.scala-compat-version }} 111 | package-version: ${{ needs.config.outputs.release-version }} 112 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: CI 2 | 3 | on: 4 | schedule: 5 | - cron: '0 8 */10 * *' 6 | push: 7 | tags: 8 | - '*' 9 | merge_group: 10 | pull_request: 11 | workflow_dispatch: 12 | 13 | jobs: 14 | event_file: 15 | name: "Event File" 16 | runs-on: ubuntu-latest 17 | steps: 18 | - name: Upload 19 | uses: actions/upload-artifact@v4 20 | with: 21 | name: Event File 22 | path: ${{ github.event_path }} 23 | 24 | build-jvm: 25 | name: "Build JVM" 26 | uses: "./.github/workflows/build-jvm.yml" 27 | build-snapshots: 28 | name: "Build Snapshots" 29 | uses: "./.github/workflows/build-snapshots.yml" 30 | build-python: 31 | name: "Build Python" 32 | needs: build-jvm 33 | uses: "./.github/workflows/build-python.yml" 34 | 35 | test-jvm: 36 | name: "Test JVM" 37 | needs: build-jvm 38 | uses: "./.github/workflows/test-jvm.yml" 39 | test-python: 40 | name: "Test Python" 41 | needs: build-jvm 42 | uses: "./.github/workflows/test-python.yml" 43 | test-snapshots-jvm: 44 | name: "Test Snapshots" 45 | needs: build-snapshots 46 | uses: "./.github/workflows/test-snapshots.yml" 47 | 48 | check: 49 | name: "Check" 50 | needs: build-jvm 51 | uses: "./.github/workflows/check.yml" 52 | 53 | test_success: 54 | name: "Test success" 55 | if: always() 56 | runs-on: ubuntu-latest 57 | needs: [build-jvm, build-python, test-jvm, test-python] 58 | 59 | steps: 60 | - name: "Success" 61 | if: success() 62 | run: true 63 | shell: bash 64 | - name: "Failure" 65 | if: failure() 66 | run: false 67 | shell: bash 68 | -------------------------------------------------------------------------------- /.github/workflows/clear-caches.yaml: -------------------------------------------------------------------------------- 1 | name: Clear caches 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | permissions: 7 | actions: write 8 | 9 | jobs: 10 | clear-cache: 11 | runs-on: ubuntu-latest 12 | steps: 13 | - name: Clear caches 
14 | uses: actions/github-script@v7 15 | with: 16 | script: | 17 | const caches = await github.paginate( 18 | github.rest.actions.getActionsCacheList.endpoint.merge({ 19 | owner: context.repo.owner, 20 | repo: context.repo.repo, 21 | }) 22 | ) 23 | for (const cache of caches) { 24 | console.log(cache) 25 | github.rest.actions.deleteActionsCacheById({ 26 | owner: context.repo.owner, 27 | repo: context.repo.repo, 28 | cache_id: cache.id, 29 | }) 30 | } 31 | 32 | -------------------------------------------------------------------------------- /.github/workflows/prime-caches.yml: -------------------------------------------------------------------------------- 1 | name: Prime caches 2 | 3 | on: 4 | workflow_dispatch: 5 | 6 | jobs: 7 | test: 8 | name: Spark ${{ matrix.spark-compat-version }}.${{ matrix.spark-patch-version }} Scala ${{ matrix.scala-version }} 9 | runs-on: ubuntu-latest 10 | 11 | strategy: 12 | fail-fast: false 13 | # keep in-sync with .github/workflows/test-jvm.yml 14 | matrix: 15 | scala-compat-version: ['2.12', '2.13'] 16 | spark-compat-version: ['3.4', '3.5'] 17 | spark-patch-version: ['0', '1', '2', '3', '4'] 18 | 19 | include: 20 | - spark-compat-version: '3.0' 21 | scala-compat-version: '2.12' 22 | scala-version: '2.12.10' 23 | spark-patch-version: '3' 24 | hadoop-version: '2.7' 25 | - spark-compat-version: '3.1' 26 | scala-compat-version: '2.12' 27 | scala-version: '2.12.10' 28 | spark-patch-version: '3' 29 | hadoop-version: '2.7' 30 | - spark-compat-version: '3.2' 31 | scala-compat-version: '2.12' 32 | scala-version: '2.12.15' 33 | spark-patch-version: '4' 34 | hadoop-version: '2.7' 35 | - spark-compat-version: '3.3' 36 | scala-compat-version: '2.12' 37 | scala-version: '2.12.15' 38 | spark-patch-version: '4' 39 | hadoop-version: '3' 40 | - spark-compat-version: '3.4' 41 | scala-compat-version: '2.12' 42 | scala-version: '2.12.17' 43 | hadoop-version: '3' 44 | - spark-compat-version: '3.5' 45 | scala-compat-version: '2.12' 46 | scala-version: '2.12.18' 47 | hadoop-version: '3' 48 | - spark-compat-version: '3.5' 49 | scala-compat-version: '2.12' 50 | scala-version: '2.12.18' 51 | spark-patch-version: '5' 52 | hadoop-version: '3' 53 | 54 | - spark-compat-version: '3.2' 55 | scala-compat-version: '2.13' 56 | scala-version: '2.13.5' 57 | spark-patch-version: '4' 58 | hadoop-version: '3.2' 59 | - spark-compat-version: '3.3' 60 | scala-compat-version: '2.13' 61 | scala-version: '2.13.8' 62 | spark-patch-version: '4' 63 | hadoop-version: '3' 64 | - spark-compat-version: '3.4' 65 | scala-compat-version: '2.13' 66 | scala-version: '2.13.8' 67 | hadoop-version: '3' 68 | - spark-compat-version: '3.5' 69 | scala-compat-version: '2.13' 70 | scala-version: '2.13.8' 71 | hadoop-version: '3' 72 | - spark-compat-version: '3.5' 73 | scala-compat-version: '2.13' 74 | scala-version: '2.13.8' 75 | spark-patch-version: '5' 76 | hadoop-version: '3' 77 | 78 | steps: 79 | - name: Checkout 80 | uses: actions/checkout@v4 81 | 82 | - name: Prime caches 83 | uses: ./.github/actions/prime-caches 84 | with: 85 | spark-version: ${{ matrix.spark-compat-version }}.${{ matrix.spark-patch-version }} 86 | scala-version: ${{ matrix.scala-version }} 87 | spark-compat-version: ${{ matrix.spark-compat-version }} 88 | scala-compat-version: ${{ matrix.scala-compat-version }} 89 | hadoop-version: ${{ matrix.hadoop-version }} 90 | java-compat-version: '8' 91 | -------------------------------------------------------------------------------- /.github/workflows/publish-release.yml: 
-------------------------------------------------------------------------------- 1 | name: Publish release 2 | 3 | on: 4 | workflow_dispatch: 5 | inputs: 6 | versions: 7 | required: true 8 | type: string 9 | description: 'Example: {"include": [{"params": {"spark-version": "3.0.3","scala-version": "2.12.10"}}]}' 10 | default: | 11 | { 12 | "include": [ 13 | {"params": {"spark-version": "3.0.3","scala-version": "2.12.10"}}, 14 | {"params": {"spark-version": "3.1.3","scala-version": "2.12.10"}}, 15 | {"params": {"spark-version": "3.2.4","scala-version": "2.12.15"}}, 16 | {"params": {"spark-version": "3.3.4","scala-version": "2.12.15"}}, 17 | {"params": {"spark-version": "3.4.4","scala-version": "2.12.17"}}, 18 | {"params": {"spark-version": "3.5.5","scala-version": "2.12.18"}}, 19 | {"params": {"spark-version": "3.2.4","scala-version": "2.13.5"}}, 20 | {"params": {"spark-version": "3.3.4","scala-version": "2.13.8"}}, 21 | {"params": {"spark-version": "3.4.4","scala-version": "2.13.8"}}, 22 | {"params": {"spark-version": "3.5.5","scala-version": "2.13.8"}} 23 | ] 24 | } 25 | 26 | env: 27 | # PySpark 3 versions only work with Python 3.9 28 | PYTHON_VERSION: "3.9" 29 | 30 | jobs: 31 | maven-release: 32 | name: Publish maven release 33 | runs-on: ubuntu-latest 34 | if: ( ! github.event.repository.fork ) 35 | # secrets are provided by environment 36 | environment: 37 | name: release 38 | # a different URL for each point in the matrix, but the same URLs across commits 39 | url: 'https://github.com/G-Research/spark-extension?spark=${{ matrix.params.spark-version }}&scala=${{ matrix.params.scala-version }}' 40 | permissions: 41 | id-token: write # required for PyPI publish 42 | strategy: 43 | fail-fast: false 44 | matrix: ${{ fromJson(github.event.inputs.versions) }} 45 | 46 | steps: 47 | - name: Checkout release tag 48 | uses: actions/checkout@v4 49 | 50 | - name: Get versions 51 | id: versions 52 | run: | 53 | # get release version 54 | version=$(grep --max-count=1 "<version>.*</version>" pom.xml | sed -E -e "s/\s*<[^>]+>//g" -e "s/-SNAPSHOT//" -e "s/-[0-9.]+//g") 55 | is_snapshot=$(if grep -q "<version>.*-SNAPSHOT</version>" pom.xml; then echo "true"; else echo "false"; fi) 56 | 57 | # share versions 58 | echo "release-tag=v${version}" >> "$GITHUB_OUTPUT" 59 | echo "is-snapshot=$is_snapshot" >> "$GITHUB_OUTPUT" 60 | 61 | - name: Check tag setup 62 | run: | 63 | # Check tag setup 64 | if [[ "$GITHUB_REF" != "refs/tags/v"* ]] 65 | then 66 | echo "This workflow must be run on a tag, not $GITHUB_REF" 67 | exit 1 68 | fi 69 | 70 | if [ "${{ steps.versions.outputs.is-snapshot }}" == "true" ] 71 | then 72 | echo "This is a tagged SNAPSHOT version. This is not allowed for release!" 73 | exit 1 74 | fi 75 | 76 | if [ "${{ github.ref_name }}" != "${{ steps.versions.outputs.release-tag }}" ] 77 | then 78 | echo "The version in the pom.xml is ${{ steps.versions.outputs.release-tag }}" 79 | echo "This tag is ${{ github.ref_name }}, which is different!" 
80 | exit 1 81 | fi 82 | 83 | - name: Set up JDK and publish to Maven Central 84 | uses: actions/setup-java@3a4f6e1af504cf6a31855fa899c6aa5355ba6c12 # v4.7.0 85 | with: 86 | java-version: '8' 87 | distribution: 'corretto' 88 | server-id: ossrh 89 | server-username: MAVEN_USERNAME 90 | server-password: MAVEN_PASSWORD 91 | gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }} 92 | gpg-passphrase: MAVEN_GPG_PASSPHRASE 93 | 94 | - name: Inspect GPG 95 | run: gpg -k 96 | 97 | - uses: actions/setup-python@v5 98 | with: 99 | python-version: ${{ env.PYTHON_VERSION }} 100 | 101 | - name: Restore Maven packages cache 102 | id: cache-maven 103 | uses: actions/cache/restore@v4 104 | with: 105 | path: ~/.m2/repository 106 | key: ${{ runner.os }}-mvn-build-${{ matrix.params.spark-version }}-${{ matrix.params.scala-version }}-${{ hashFiles('pom.xml') }} 107 | restore-keys: ${{ runner.os }}-mvn-build-${{ matrix.params.spark-version }}-${{ matrix.params.scala-version }}- 108 | 109 | - name: Publish maven artifacts 110 | id: publish-maven 111 | run: | 112 | ./set-version.sh ${{ matrix.params.spark-version }} ${{ matrix.params.scala-version }} 113 | mvn clean deploy -Dsign -Dspotless.check.skip -DskipTests -Dmaven.test.skip=true 114 | env: 115 | MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }} 116 | MAVEN_PASSWORD: ${{ secrets.OSSRH_PASSWORD }} 117 | MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }} 118 | 119 | - name: Prepare PyPi package 120 | id: prepare-pypi-package 121 | if: startsWith(matrix.params.scala-version, '2.12') 122 | run: | 123 | ./build-whl.sh 124 | 125 | - name: Publish package distributions to PyPI 126 | uses: pypa/gh-action-pypi-publish@release/v1 127 | if: startsWith(matrix.params.scala-version, '2.12') 128 | with: 129 | user: ${{ secrets.PYPI_USERNAME }} 130 | password: ${{ secrets.PYPI_PASSWORD }} 131 | packages-dir: python/dist 132 | verbose: true 133 | -------------------------------------------------------------------------------- /.github/workflows/publish-snapshot.yml: -------------------------------------------------------------------------------- 1 | name: Publish snapshot 2 | 3 | on: 4 | workflow_dispatch: 5 | push: 6 | branches: ["master"] 7 | 8 | env: 9 | PYTHON_VERSION: "3.10" 10 | 11 | jobs: 12 | check-version: 13 | name: Check SNAPSHOT version 14 | if: ( ! 
github.event.repository.fork ) 15 | runs-on: ubuntu-latest 16 | permissions: {} 17 | outputs: 18 | is-snapshot: ${{ steps.check.outputs.is-snapshot }} 19 | 20 | steps: 21 | - name: Checkout code 22 | uses: actions/checkout@v4 23 | 24 | - name: Check if this is a SNAPSHOT version 25 | id: check 26 | run: | 27 | # check is snapshot version 28 | if grep -q "<version>.*-SNAPSHOT</version>" pom.xml 29 | then 30 | echo "Version in pom IS a SNAPSHOT version" 31 | echo "is-snapshot=true" >> "$GITHUB_OUTPUT" 32 | else 33 | echo "Version in pom is NOT a SNAPSHOT version" 34 | echo "is-snapshot=false" >> "$GITHUB_OUTPUT" 35 | fi 36 | 37 | snapshot: 38 | name: Snapshot Spark ${{ matrix.params.spark-version }} Scala ${{ matrix.params.scala-version }} 39 | needs: check-version 40 | # when we release from master, this workflow will see a commit that does not have a SNAPSHOT version 41 | # we want this workflow to skip over that commit 42 | if: needs.check-version.outputs.is-snapshot == 'true' 43 | runs-on: ubuntu-latest 44 | # secrets are provided by environment 45 | environment: 46 | name: snapshot 47 | # a different URL for each point in the matrix, but the same URLs across commits 48 | url: 'https://github.com/G-Research/spark-extension?spark=${{ matrix.params.spark-version }}&scala=${{ matrix.params.scala-version }}&snapshot' 49 | permissions: {} 50 | strategy: 51 | fail-fast: false 52 | matrix: 53 | include: 54 | - params: {"spark-version": "3.0.3", "scala-version": "2.12.10", "scala-compat-version": "2.12"} 55 | - params: {"spark-version": "3.1.3", "scala-version": "2.12.10", "scala-compat-version": "2.12"} 56 | - params: {"spark-version": "3.2.4", "scala-version": "2.12.15", "scala-compat-version": "2.12"} 57 | - params: {"spark-version": "3.3.4", "scala-version": "2.12.15", "scala-compat-version": "2.12"} 58 | - params: {"spark-version": "3.4.4", "scala-version": "2.12.17", "scala-compat-version": "2.12"} 59 | - params: {"spark-version": "3.5.5", "scala-version": "2.12.18", "scala-compat-version": "2.12"} 60 | - params: {"spark-version": "3.2.4", "scala-version": "2.13.5", "scala-compat-version": "2.13"} 61 | - params: {"spark-version": "3.3.4", "scala-version": "2.13.8", "scala-compat-version": "2.13"} 62 | - params: {"spark-version": "3.4.4", "scala-version": "2.13.8", "scala-compat-version": "2.13"} 63 | - params: {"spark-version": "3.5.5", "scala-version": "2.13.8", "scala-compat-version": "2.13"} 64 | 65 | steps: 66 | - name: Checkout code 67 | uses: actions/checkout@v4 68 | 69 | - name: Set up JDK and publish to Maven Central 70 | uses: actions/setup-java@3a4f6e1af504cf6a31855fa899c6aa5355ba6c12 # v4.7.0 71 | with: 72 | java-version: '8' 73 | distribution: 'corretto' 74 | server-id: ossrh 75 | server-username: MAVEN_USERNAME 76 | server-password: MAVEN_PASSWORD 77 | gpg-private-key: ${{ secrets.GPG_PRIVATE_KEY }} 78 | gpg-passphrase: MAVEN_GPG_PASSPHRASE 79 | 80 | - name: Inspect GPG 81 | run: gpg -k 82 | 83 | - uses: actions/setup-python@v5 84 | with: 85 | python-version: ${{ env.PYTHON_VERSION }} 86 | 87 | - name: Restore Maven packages cache 88 | id: cache-maven 89 | uses: actions/cache/restore@v4 90 | with: 91 | path: ~/.m2/repository 92 | key: ${{ runner.os }}-mvn-build-${{ matrix.params.spark-version }}-${{ matrix.params.scala-version }}-${{ hashFiles('pom.xml') }} 93 | restore-keys: ${{ runner.os }}-mvn-build-${{ matrix.params.spark-version }}-${{ matrix.params.scala-version }}- 94 | 95 | - name: Publish snapshot 96 | run: | 97 | ./set-version.sh ${{ matrix.params.spark-version }} ${{ 
matrix.params.scala-version }} 98 | mvn clean deploy -Dsign -Dspotless.check.skip -DskipTests -Dmaven.test.skip=true 99 | env: 100 | MAVEN_USERNAME: ${{ secrets.OSSRH_USERNAME }} 101 | MAVEN_PASSWORD: ${{ secrets.OSSRH_PASSWORD }} 102 | MAVEN_GPG_PASSPHRASE: ${{ secrets.GPG_PASSPHRASE }} 103 | 104 | - name: Prepare PyPi package to test snapshot 105 | if: startsWith(matrix.params.scala-version, '2.12') 106 | run: | 107 | # Build whl 108 | ./build-whl.sh 109 | 110 | - name: Restore Spark Binaries cache 111 | uses: actions/cache/restore@v4 112 | with: 113 | path: ~/spark 114 | key: ${{ runner.os }}-spark-binaries-${{ matrix.params.spark-version }}-${{ matrix.params.scala-compat-version }} 115 | restore-keys: 116 | ${{ runner.os }}-spark-binaries-${{ matrix.params.spark-version }}-${{ matrix.params.scala-compat-version }} 117 | 118 | - name: Rename Spark Binaries cache 119 | run: | 120 | mv ~/spark ./spark-${{ matrix.params.spark-version }}-${{ matrix.params.scala-compat-version }} 121 | 122 | - name: Test snapshot 123 | id: test-package 124 | run: | 125 | # Test the snapshot (needs whl) 126 | ./test-release.sh 127 | -------------------------------------------------------------------------------- /.github/workflows/test-jvm.yml: -------------------------------------------------------------------------------- 1 | name: Test JVM 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | test: 8 | name: Test (Spark ${{ matrix.spark-compat-version }}.${{ matrix.spark-patch-version }} Scala ${{ matrix.scala-version }}) 9 | runs-on: ubuntu-latest 10 | 11 | strategy: 12 | fail-fast: false 13 | # keep in-sync with .github/workflows/prime-caches.yml 14 | matrix: 15 | scala-compat-version: ['2.12', '2.13'] 16 | spark-compat-version: ['3.4', '3.5'] 17 | spark-patch-version: ['0', '1', '2', '3', '4'] 18 | 19 | include: 20 | - spark-compat-version: '3.0' 21 | scala-compat-version: '2.12' 22 | scala-version: '2.12.10' 23 | spark-patch-version: '3' 24 | hadoop-version: '2.7' 25 | - spark-compat-version: '3.1' 26 | scala-compat-version: '2.12' 27 | scala-version: '2.12.10' 28 | spark-patch-version: '3' 29 | hadoop-version: '2.7' 30 | - spark-compat-version: '3.2' 31 | scala-compat-version: '2.12' 32 | scala-version: '2.12.15' 33 | spark-patch-version: '4' 34 | hadoop-version: '2.7' 35 | - spark-compat-version: '3.3' 36 | scala-compat-version: '2.12' 37 | scala-version: '2.12.15' 38 | spark-patch-version: '4' 39 | hadoop-version: '3' 40 | - spark-compat-version: '3.4' 41 | scala-compat-version: '2.12' 42 | scala-version: '2.12.17' 43 | hadoop-version: '3' 44 | - spark-compat-version: '3.5' 45 | scala-compat-version: '2.12' 46 | scala-version: '2.12.18' 47 | hadoop-version: '3' 48 | - spark-compat-version: '3.5' 49 | scala-compat-version: '2.12' 50 | scala-version: '2.12.18' 51 | spark-patch-version: '5' 52 | hadoop-version: '3' 53 | 54 | - spark-compat-version: '3.2' 55 | scala-compat-version: '2.13' 56 | scala-version: '2.13.5' 57 | spark-patch-version: '4' 58 | hadoop-version: '3.2' 59 | - spark-compat-version: '3.3' 60 | scala-compat-version: '2.13' 61 | scala-version: '2.13.8' 62 | spark-patch-version: '4' 63 | hadoop-version: '3' 64 | - spark-compat-version: '3.4' 65 | scala-compat-version: '2.13' 66 | scala-version: '2.13.8' 67 | hadoop-version: '3' 68 | - spark-compat-version: '3.5' 69 | scala-compat-version: '2.13' 70 | scala-version: '2.13.8' 71 | hadoop-version: '3' 72 | - spark-compat-version: '3.5' 73 | scala-compat-version: '2.13' 74 | scala-version: '2.13.8' 75 | spark-patch-version: '5' 76 | hadoop-version: 
'3' 77 | 78 | steps: 79 | - name: Checkout 80 | uses: actions/checkout@v4 81 | 82 | - name: Test 83 | uses: ./.github/actions/test-jvm 84 | env: 85 | CI_SLOW_TESTS: 1 86 | with: 87 | spark-version: ${{ matrix.spark-compat-version }}.${{ matrix.spark-patch-version }} 88 | scala-version: ${{ matrix.scala-version }} 89 | spark-compat-version: ${{ matrix.spark-compat-version }} 90 | scala-compat-version: ${{ matrix.scala-compat-version }} 91 | hadoop-version: ${{ matrix.hadoop-version }} 92 | java-compat-version: '8' 93 | -------------------------------------------------------------------------------- /.github/workflows/test-python.yml: -------------------------------------------------------------------------------- 1 | name: Test Python 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | # pyspark is not available for snapshots or scala other than 2.12 8 | # we would have to compile spark from sources for this, not worth it 9 | test: 10 | name: Test (Spark ${{ matrix.spark-version }} Scala ${{ matrix.scala-version }} Python ${{ matrix.python-version }}) 11 | runs-on: ubuntu-latest 12 | 13 | strategy: 14 | fail-fast: false 15 | matrix: 16 | spark-compat-version: ['3.2', '3.3', '3.4', '3.5'] 17 | python-version: ['3.9', '3.10', '3.11', '3.12', '3.13'] 18 | 19 | include: 20 | - spark-compat-version: '3.0' 21 | spark-version: '3.0.3' 22 | hadoop-version: '2.7' 23 | scala-compat-version: '2.12' 24 | scala-version: '2.12.10' 25 | python-version: '3.8' 26 | - spark-compat-version: '3.1' 27 | spark-version: '3.1.3' 28 | hadoop-version: '2.7' 29 | scala-compat-version: '2.12' 30 | scala-version: '2.12.10' 31 | python-version: '3.8' 32 | - spark-compat-version: '3.2' 33 | spark-version: '3.2.4' 34 | hadoop-version: '2.7' 35 | scala-compat-version: '2.12' 36 | scala-version: '2.12.15' 37 | - spark-compat-version: '3.3' 38 | spark-version: '3.3.4' 39 | hadoop-version: '3' 40 | scala-compat-version: '2.12' 41 | scala-version: '2.12.15' 42 | - spark-compat-version: '3.4' 43 | spark-version: '3.4.4' 44 | hadoop-version: '3' 45 | scala-compat-version: '2.12' 46 | scala-version: '2.12.17' 47 | - spark-compat-version: '3.5' 48 | spark-version: '3.5.5' 49 | hadoop-version: '3' 50 | scala-compat-version: '2.12' 51 | scala-version: '2.12.18' 52 | 53 | exclude: 54 | - spark-compat-version: '3.2' 55 | python-version: '3.10' 56 | - spark-compat-version: '3.2' 57 | python-version: '3.11' 58 | - spark-compat-version: '3.2' 59 | python-version: '3.12' 60 | - spark-compat-version: '3.2' 61 | python-version: '3.13' 62 | 63 | - spark-compat-version: '3.3' 64 | python-version: '3.11' 65 | - spark-compat-version: '3.3' 66 | python-version: '3.12' 67 | - spark-compat-version: '3.3' 68 | python-version: '3.13' 69 | 70 | - spark-compat-version: '3.4' 71 | python-version: '3.12' 72 | - spark-compat-version: '3.4' 73 | python-version: '3.13' 74 | 75 | - spark-compat-version: '3.5' 76 | python-version: '3.12' 77 | - spark-compat-version: '3.5' 78 | python-version: '3.13' 79 | 80 | steps: 81 | - name: Checkout 82 | uses: actions/checkout@v4 83 | 84 | - name: Test 85 | uses: ./.github/actions/test-python 86 | with: 87 | spark-version: ${{ matrix.spark-version }} 88 | scala-version: ${{ matrix.scala-version }} 89 | spark-compat-version: ${{ matrix.spark-compat-version }} 90 | scala-compat-version: ${{ matrix.scala-compat-version }} 91 | hadoop-version: ${{ matrix.hadoop-version }} 92 | python-version: ${{ matrix.python-version }} 93 | -------------------------------------------------------------------------------- 
/.github/workflows/test-results.yml: -------------------------------------------------------------------------------- 1 | name: Test Results 2 | 3 | on: 4 | workflow_run: 5 | workflows: ["CI"] 6 | types: 7 | - completed 8 | permissions: {} 9 | 10 | jobs: 11 | publish-test-results: 12 | name: Publish Test Results 13 | runs-on: ubuntu-latest 14 | if: github.event.workflow_run.conclusion != 'skipped' 15 | permissions: 16 | checks: write 17 | pull-requests: write 18 | 19 | steps: 20 | - name: Download and Extract Artifacts 21 | uses: dawidd6/action-download-artifact@09f2f74827fd3a8607589e5ad7f9398816f540fe 22 | with: 23 | run_id: ${{ github.event.workflow_run.id }} 24 | name: "^Event File$| Test Results " 25 | name_is_regexp: true 26 | path: artifacts 27 | 28 | - name: Publish Test Results 29 | uses: EnricoMi/publish-unit-test-result-action@v2 30 | with: 31 | commit: ${{ github.event.workflow_run.head_sha }} 32 | event_file: artifacts/Event File/event.json 33 | event_name: ${{ github.event.workflow_run.event }} 34 | junit_files: "artifacts/* Test Results*/**/*.xml" 35 | -------------------------------------------------------------------------------- /.github/workflows/test-snapshots.yml: -------------------------------------------------------------------------------- 1 | name: Test Snapshots 2 | 3 | on: 4 | workflow_call: 5 | 6 | jobs: 7 | test: 8 | name: Test (Spark ${{ matrix.spark-version }} Scala ${{ matrix.scala-version }}) 9 | runs-on: ubuntu-latest 10 | 11 | strategy: 12 | fail-fast: false 13 | matrix: 14 | include: 15 | - spark-compat-version: '3.2' 16 | spark-version: '3.2.5-SNAPSHOT' 17 | scala-compat-version: '2.12' 18 | scala-version: '2.12.15' 19 | java-compat-version: '8' 20 | - spark-compat-version: '3.3' 21 | spark-version: '3.3.5-SNAPSHOT' 22 | scala-compat-version: '2.12' 23 | scala-version: '2.12.15' 24 | java-compat-version: '8' 25 | - spark-compat-version: '3.4' 26 | spark-version: '3.4.5-SNAPSHOT' 27 | scala-compat-version: '2.12' 28 | scala-version: '2.12.17' 29 | java-compat-version: '8' 30 | - spark-compat-version: '3.5' 31 | spark-version: '3.5.6-SNAPSHOT' 32 | scala-compat-version: '2.12' 33 | scala-version: '2.12.17' 34 | java-compat-version: '8' 35 | 36 | - spark-compat-version: '3.2' 37 | spark-version: '3.2.5-SNAPSHOT' 38 | scala-compat-version: '2.13' 39 | scala-version: '2.13.5' 40 | java-compat-version: '8' 41 | - spark-compat-version: '3.3' 42 | spark-version: '3.3.5-SNAPSHOT' 43 | scala-compat-version: '2.13' 44 | scala-version: '2.13.8' 45 | java-compat-version: '8' 46 | - spark-compat-version: '3.4' 47 | spark-version: '3.4.5-SNAPSHOT' 48 | scala-compat-version: '2.13' 49 | scala-version: '2.13.8' 50 | java-compat-version: '8' 51 | - spark-compat-version: '3.5' 52 | spark-version: '3.5.6-SNAPSHOT' 53 | scala-compat-version: '2.13' 54 | scala-version: '2.13.8' 55 | java-compat-version: '8' 56 | - spark-compat-version: '4.0' 57 | spark-version: '4.0.1-SNAPSHOT' 58 | scala-compat-version: '2.13' 59 | scala-version: '2.13.16' 60 | java-compat-version: '17' 61 | - spark-compat-version: '4.1' 62 | spark-version: '4.1.0-SNAPSHOT' 63 | scala-compat-version: '2.13' 64 | scala-version: '2.13.16' 65 | java-compat-version: '17' 66 | 67 | steps: 68 | - name: Checkout 69 | uses: actions/checkout@v4 70 | 71 | - name: Test 72 | uses: ./.github/actions/test-jvm 73 | env: 74 | CI_SLOW_TESTS: 1 75 | with: 76 | spark-version: ${{ matrix.spark-version }} 77 | scala-version: ${{ matrix.scala-version }} 78 | spark-compat-version: ${{ matrix.spark-compat-version }}-SNAPSHOT 79 | 
scala-compat-version: ${{ matrix.scala-compat-version }} 80 | java-compat-version: ${{ matrix.java-compat-version }} 81 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # use glob syntax. 2 | syntax: glob 3 | *.ser 4 | *.class 5 | *~ 6 | *.bak 7 | #*.off 8 | *.old 9 | 10 | # eclipse conf file 11 | .settings 12 | .classpath 13 | .project 14 | .manager 15 | .scala_dependencies 16 | 17 | # idea 18 | .idea 19 | *.iml 20 | 21 | # building 22 | target 23 | build 24 | null 25 | tmp* 26 | temp* 27 | dist 28 | test-output 29 | build.log 30 | 31 | # other scm 32 | .svn 33 | .CVS 34 | .hg* 35 | 36 | # switch to regexp syntax. 37 | # syntax: regexp 38 | # ^\.pc/ 39 | 40 | #SHITTY output not in target directory 41 | build.log 42 | 43 | # project specific 44 | python/**/__pycache__ 45 | python/requirements.txt 46 | spark-* 47 | .cache -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.7.17 2 | runner.dialect = scala213 3 | rewrite.trailingCommas.style = keep 4 | docstrings.style = Asterisk 5 | maxColumn = 120 6 | 7 | -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | All notable changes to this project will be documented in this file. 3 | 4 | The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). 5 | 6 | ## [2.13.0] - 2024-11-04 7 | 8 | ### Fixes 9 | - Support diff for Spark Connect implemened via PySpark Dataset API (#251) 10 | 11 | ### Added 12 | - Add ignore columns to diff in Python API (#252) 13 | - Check that the Java / Scala package is installed when needed by Python (#250) 14 | 15 | ## [2.12.0] - 2024-04-26 16 | 17 | ## Fixes 18 | 19 | - Diff change column should respect comparators (#238) 20 | 21 | ## Changed 22 | 23 | - Make create_temporary_dir work with pyspark-extension only (#222). 24 | This allows [installing PIP packages and Poetry projects](PYSPARK-DEPS.md) 25 | via pure Python spark-extension package (Maven package not required any more). 26 | - Add map diff comparator to Python API (#226) 27 | 28 | ## [2.11.0] - 2024-01-04 29 | 30 | ### Added 31 | 32 | - Add count_null aggregate function (#206) 33 | - Support reading parquet schema (#208) 34 | - Add more columns to reading parquet metadata (#209, #211) 35 | - Provide groupByKey shortcuts for groupBy.as (#213) 36 | - Allow to install PIP packages into PySpark job (#215) 37 | - Allow to install Poetry projects into PySpark job (#216) 38 | 39 | ## [2.10.0] - 2023-09-27 40 | 41 | ### Fixed 42 | 43 | - Update setup.py to include parquet methods in python package (#191) 44 | 45 | ### Added 46 | 47 | - Add --statistics option to diff app (#189) 48 | - Add --filter option to diff app (#190) 49 | 50 | ## [2.9.0] - 2023-08-23 51 | 52 | ### Added 53 | 54 | - Add key order sensitive map comparator (#187) 55 | 56 | ### Changed 57 | 58 | - Use dataset encoder rather than implicit value encoder for implicit dataset extension class (#183) 59 | 60 | ### Fixed 61 | 62 | - Fix key-sensitivity in map comparator (#186) 63 | 64 | ## [2.8.0] - 2023-05-24 65 | 66 | ### Added 67 | 68 | - Add method to set and automatically unset Spark job description. 
(#172) 69 | - Add column function that converts between .Net (C#, F#, Visual Basic) `DateTime.Ticks` and Spark timestamp / Unix epoch timestamps. (#153) 70 | 71 | ## [2.7.0] - 2023-05-05 72 | 73 | ### Added 74 | 75 | - Spark app to diff files or tables and write result back to file or table. (#160) 76 | - Add null value count to `parquetBlockColumns` and `parquet_block_columns`. (#162) 77 | - Add `parallelism` argument to Parquet metadata methods. (#164) 78 | 79 | ### Changed 80 | 81 | - Change data type of column name in `parquetBlockColumns` and `parquet_block_columns` to array of strings. 82 | Cast to string to get earlier behaviour (string column name). (#162) 83 | 84 | ## [2.6.0] - 2023-04-11 85 | 86 | ### Added 87 | 88 | - Add reader for parquet metadata. (#154) 89 | 90 | ## [2.5.0] - 2023-03-23 91 | 92 | ### Added 93 | 94 | - Add whitespace agnostic diff comparator. (#137) 95 | - Add Python whl package build. (#151) 96 | 97 | ## [2.4.0] - 2022-12-08 98 | 99 | ### Added 100 | 101 | - Allow for custom diff equality. (#127) 102 | 103 | ### Fixed 104 | 105 | - Fix Python API calling into Scala code. (#132) 106 | 107 | ## [2.3.0] - 2022-10-26 108 | 109 | ### Added 110 | 111 | - Add diffWith to Scala, Java and Python Diff API. (#109) 112 | 113 | ### Changed 114 | 115 | - Diff similar Datasets with ignoreColumns. Before, only similar DataFrames could be diffed with ignoreColumns. (#111) 116 | 117 | ### Fixed 118 | 119 | - Cache before writing via partitionedBy to work around SPARK-40588. Unpersist via UnpersistHandle. (#124) 120 | 121 | ## [2.2.0] - 2022-07-21 122 | 123 | ### Added 124 | - Add (global) row numbers transformation to Scala, Java and Python API. (#97) 125 | 126 | ### Removed 127 | - Removed support for Python 3.6. 128 | 129 | ## [2.1.0] - 2022-04-07 130 | 131 | ### Added 132 | - Add sorted group methods to Dataset. (#76) 133 | 134 | ## [2.0.0] - 2021-10-29 135 | 136 | ### Added 137 | - Add support for Spark 3.2 and Scala 2.13. 138 | - Support to ignore columns in diff API. (#63) 139 | 140 | ### Removed 141 | - Removed support for Spark 2.4. 142 | 143 | ## [1.3.3] - 2020-12-17 144 | 145 | ### Added 146 | - Add support for Spark 3.1. 147 | 148 | ## [1.3.2] - 2020-12-17 149 | 150 | ### Changed 151 | - Refine conditional transformation helper methods. 152 | 153 | ## [1.3.1] - 2020-12-10 154 | 155 | ### Changed 156 | - Refine conditional transformation helper methods. 157 | 158 | ## [1.3.0] - 2020-12-07 159 | 160 | ### Added 161 | - Add transformation to compute histogram. (#26) 162 | - Add conditional transformation helper methods. (#27) 163 | - Add partitioned writing helpers that simplify writing optimally ordered partitioned data. (#29) 164 | 165 | ## [1.2.0] - 2020-10-06 166 | 167 | ### Added 168 | - Add diff modes (#22): column-by-column, side-by-side, left and right side diff modes. 169 | - Add sparse mode (#23): diff DataFrame contains only changed values. 170 | 171 | ## [1.1.0] - 2020-08-24 172 | 173 | ### Added 174 | - Add Python API for Diff transformation. 175 | - Add change column to Diff transformation providing column names of all changed columns in a row. 176 | - Add fluent methods to change immutable diff options. 177 | - Add `backticks` method to handle column names that contain dots (`.`). 178 | 179 | ## [1.0.0] - 2020-03-12 180 | 181 | ### Added 182 | - Add Diff transformation for Datasets.
183 | -------------------------------------------------------------------------------- /CONDITIONAL.md: -------------------------------------------------------------------------------- 1 | # DataFrame Transformations 2 | 3 | The Spark `Dataset` API allows for chaining transformations as in the following example: 4 | 5 | ```scala 6 | ds.where($"id" === 1) 7 | .withColumn("state", lit("new")) 8 | .orderBy($"timestamp") 9 | ``` 10 | 11 | When you define additional transformation functions, the `Dataset` API allows you to 12 | also fluently call into those: 13 | 14 | ```scala 15 | def transformation(df: DataFrame): DataFrame = df.distinct 16 | 17 | ds.transform(transformation) 18 | ``` 19 | 20 | Here are some methods that extend this principle to conditional calls. 21 | 22 | ## Conditional Transformations 23 | 24 | You can run a transformation after checking a condition with a chain of fluent transformation calls: 25 | 26 | ```scala 27 | import uk.co.gresearch._ 28 | 29 | val condition = true 30 | 31 | val result = 32 | ds.where($"id" === 1) 33 | .withColumn("state", lit("new")) 34 | .when(condition).call(transformation) 35 | .orderBy($"timestamp") 36 | ``` 37 | 38 | rather than 39 | 40 | ```scala 41 | val condition = true 42 | 43 | val filteredDf = ds.where($"id" === 1) 44 | .withColumn("state", lit("new")) 45 | val condDf = if (condition) ds.call(transformation) else ds 46 | val result = ds.orderBy($"timestamp") 47 | ``` 48 | 49 | In case you need an else transformation as well, try: 50 | 51 | ```scala 52 | import uk.co.gresearch._ 53 | 54 | val condition = true 55 | 56 | val result = 57 | ds.where($"id" === 1) 58 | .withColumn("state", lit("new")) 59 | .on(condition).either(transformation).or(other) 60 | .orderBy($"timestamp") 61 | ``` 62 | 63 | ## Fluent and conditional functions elsewhere 64 | 65 | The same fluent notation works for instances other than `Dataset` or `DataFrame`, e.g. 66 | for the `DataFrameWriter`: 67 | 68 | ```scala 69 | def writeData[T](writer: DataFrameWriter[T]): Unit = { ... } 70 | 71 | ds.write 72 | .when(compress).call(_.option("compression", "gzip")) 73 | .call(writeData) 74 | ``` 75 | -------------------------------------------------------------------------------- /GROUPS.md: -------------------------------------------------------------------------------- 1 | # Sorted Groups 2 | 3 | Spark provides the ability to group rows by an arbitrary key, 4 | while then providing an iterator for each of these groups. 5 | This allows to iterate over groups that are too large to fit into memory: 6 | 7 | ```scala 8 | import org.apache.spark.sql.Dataset 9 | 10 | import spark.implicits._ 11 | 12 | case class Val(id: Int, seq: Int, value: Double) 13 | 14 | val ds: Dataset[Val] = Seq( 15 | Val(1, 1, 1.1), 16 | Val(1, 2, 1.2), 17 | Val(1, 3, 1.3), 18 | 19 | Val(2, 1, 2.1), 20 | Val(2, 2, 2.2), 21 | Val(2, 3, 2.3), 22 | 23 | Val(3, 1, 3.1) 24 | ).reverse.toDS().repartition(3).cache() 25 | 26 | // order of iterator IS NOT guaranteed 27 | ds.groupByKey(v => v.id) 28 | .flatMapGroups((key, it) => it.zipWithIndex.map(v => (key, v._2, v._1.seq, v._1.value))) 29 | .toDF("key", "index", "seq", "value") 30 | .show(false) 31 | 32 | +---+-----+---+-----+ 33 | |key|index|seq|value| 34 | +---+-----+---+-----+ 35 | |1 |0 |3 |1.3 | 36 | |1 |1 |2 |1.2 | 37 | |1 |2 |1 |1.1 | 38 | |2 |0 |1 |2.1 | 39 | |2 |1 |3 |2.3 | 40 | |2 |2 |2 |2.2 | 41 | |3 |0 |1 |3.1 | 42 | +---+-----+---+-----+ 43 | ``` 44 | 45 | However, we have no control over the order of the group iterators. 
46 | If we want the iterators to be ordered according to `seq`, we can do the following: 47 | 48 | ```scala 49 | import uk.co.gresearch.spark._ 50 | 51 | // the group key $"id" needs an ordering 52 | implicit val ordering: Ordering.Int.type = Ordering.Int 53 | 54 | // order of iterator IS guaranteed 55 | ds.groupBySorted($"id")($"seq") 56 | .flatMapSortedGroups((key, it) => it.zipWithIndex.map(v => (key, v._2, v._1.seq, v._1.value))) 57 | .toDF("key", "index", "seq", "value") 58 | .show(false) 59 | 60 | +---+-----+---+-----+ 61 | |key|index|seq|value| 62 | +---+-----+---+-----+ 63 | |1 |0 |1 |1.1 | 64 | |1 |1 |2 |1.2 | 65 | |1 |2 |3 |1.3 | 66 | |2 |0 |1 |2.1 | 67 | |2 |1 |2 |2.2 | 68 | |2 |2 |3 |2.3 | 69 | |3 |0 |1 |3.1 | 70 | +---+-----+---+-----+ 71 | ``` 72 | 73 | Now the iterators are ordered according to `seq`, as shown by the value of `index`, 74 | which was generated by `it.zipWithIndex`. 75 | 76 | Instead of column expressions, we can also use lambdas to define the group key and group order: 77 | ```scala 78 | ds.groupByKeySorted(v => v.id)(v => v.seq) 79 | .flatMapSortedGroups((key, it) => it.zipWithIndex.map(v => (key, v._2, v._1.seq, v._1.value))) 80 | .toDF("key", "index", "seq", "value") 81 | .show(false) 82 | ``` 83 | 84 | **Note:** Using lambdas here hides from Spark which columns we use for grouping and sorting. 85 | Query optimization cannot improve partitioning and sorting in this case. Use column expressions when possible. 86 | -------------------------------------------------------------------------------- /HISTOGRAM.md: -------------------------------------------------------------------------------- 1 | # Histogram 2 | 3 | For a table `df` like 4 | 5 | |user |score| 6 | |:-----:|:---:| 7 | |Alice |101 | 8 | |Alice |221 | 9 | |Alice |211 | 10 | |Alice |176 | 11 | |Bob |276 | 12 | |Bob |232 | 13 | |Bob |258 | 14 | |Charlie|221 | 15 | 16 | you can compute the histogram for each user 17 | 18 | |user |≤100 |≤200 |>200 | 19 | |:-----:|:---:|:---:|:---:| 20 | |Alice |0 |2 |2 | 21 | |Bob |0 |0 |3 | 22 | |Charlie|0 |0 |1 | 23 | 24 | as follows: 25 | 26 | df.withColumn("≤100", when($"score" <= 100, 1).otherwise(0)) 27 | .withColumn("≤200", when($"score" > 100 && $"score" <= 200, 1).otherwise(0)) 28 | .withColumn(">200", when($"score" > 200, 1).otherwise(0)) 29 | .groupBy($"user") 30 | .agg( 31 | sum($"≤100").as("≤100"), 32 | sum($"≤200").as("≤200"), 33 | sum($">200").as(">200") 34 | ) 35 | .orderBy($"user") 36 | 37 | Equivalent to that query is: 38 | 39 | import uk.co.gresearch.spark._ 40 | 41 | df.histogram(Seq(100, 200), $"score", $"user").orderBy($"user") 42 | 43 | The first argument is a sequence of thresholds, the second argument provides the value column. 44 | The subsequent arguments refer to the aggregation columns (`groupBy`). Only the aggregation columns 45 | and the generated bucket columns appear in the result DataFrame. 46 | 47 | In Java, call: 48 | 49 | import uk.co.gresearch.spark.Histogram; 50 | 51 | Histogram.of(df, Arrays.asList(100, 200), new Column("score"), new Column("user")).orderBy(new Column("user")) 52 | 53 | In Python, call: 54 | 55 | import gresearch.spark 56 | 57 | df.histogram([100, 200], 'score', 'user').orderBy('user') 58 | 59 | Note that this feature is not supported in Python when connected with a [Spark Connect server](README.md#spark-connect-server).
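When connected to a Spark Connect server, where the `histogram` shortcut is unavailable, the same result can still be computed with plain PySpark column functions. The following is a minimal sketch mirroring the manual Scala query above, assuming the example `df` with columns `score` and `user`:

```python
from pyspark.sql import functions as F

# Bucket each score manually, then count per user - the same buckets the helper produces.
manual_hist = (
    df.withColumn("≤100", F.when(F.col("score") <= 100, 1).otherwise(0))
      .withColumn("≤200", F.when((F.col("score") > 100) & (F.col("score") <= 200), 1).otherwise(0))
      .withColumn(">200", F.when(F.col("score") > 200, 1).otherwise(0))
      .groupBy("user")
      .agg(
          F.sum("≤100").alias("≤100"),
          F.sum("≤200").alias("≤200"),
          F.sum(">200").alias(">200"),
      )
      .orderBy("user")
)
manual_hist.show()
```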
60 | -------------------------------------------------------------------------------- /MAINTAINERS.md: -------------------------------------------------------------------------------- 1 | ## Current maintainers of the project 2 | 3 | | Maintainer | GitHub ID | 4 | | ---------------------- | ------------------------------------------------------- | 5 | | Enrico Minack | [EnricoMi](https://github.com/EnricoMi) | 6 | -------------------------------------------------------------------------------- /PYSPARK-DEPS.md: -------------------------------------------------------------------------------- 1 | # PySpark dependencies 2 | 3 | Using PySpark on a cluster requires all cluster nodes to have those Python packages installed that are required by the PySpark job. 4 | Such a deployment can be cumbersome, especially when running in an interactive notebook. 5 | 6 | The `spark-extension` package allows installing Python packages programmatically by the PySpark application itself (PySpark ≥ 3.1.0). 7 | These packages are only accessible by that PySpark application, and they are removed on calling `spark.stop()`. 8 | 9 | Either install the `spark-extension` Maven package, or the `pyspark-extension` PyPi package (on the driver only), 10 | as described [here](README.md#using-spark-extension). 11 | 12 | ## Installing packages with `pip` 13 | 14 | Python packages can be installed with `pip` as follows: 15 | 16 | ```python 17 | # noinspection PyUnresolvedReferences 18 | from gresearch.spark import * 19 | 20 | spark.install_pip_package("pandas", "pyarrow") 21 | ``` 22 | 23 | Above example installs PIP packages `pandas` and `pyarrow` via `pip`. Method `install_pip_package` takes any `pip` command line argument: 24 | 25 | ```python 26 | # install packages with version specs 27 | spark.install_pip_package("pandas==1.4.3", "pyarrow~=8.0.0") 28 | 29 | # install packages from package sources (e.g. git clone https://github.com/pandas-dev/pandas.git) 30 | spark.install_pip_package("./pandas/") 31 | 32 | # install packages from git repo 33 | spark.install_pip_package("git+https://github.com/pandas-dev/pandas.git@main") 34 | 35 | # use a pip cache directory to cache downloaded and built whl files 36 | spark.install_pip_package("pandas", "pyarrow", "--cache-dir", "/home/user/.cache/pip") 37 | 38 | # use an alternative index url (other than https://pypi.org/simple) 39 | spark.install_pip_package("pandas", "pyarrow", "--index-url", "https://artifacts.company.com/pypi/simple") 40 | 41 | # install pip packages quietly (only disables output of PIP) 42 | spark.install_pip_package("pandas", "pyarrow", "--quiet") 43 | ``` 44 | 45 | ## Installing Python projects with Poetry 46 | 47 | Python projects can be installed from sources, including their dependencies, using [Poetry](https://python-poetry.org/): 48 | 49 | ```python 50 | # noinspection PyUnresolvedReferences 51 | from gresearch.spark import * 52 | 53 | spark.install_poetry_project("../my-poetry-project/", poetry_python="../venv-poetry/bin/python") 54 | ``` 55 | 56 | ## Example 57 | 58 | This example uses `install_pip_package` in a Spark standalone cluster. 59 | 60 | First checkout the example code: 61 | 62 | ```shell 63 | git clone https://github.com/G-Research/spark-extension.git 64 | cd spark-extension/examples/python-deps 65 | ``` 66 | 67 | Build a Docker image based on the official Spark release: 68 | ```shell 69 | docker build -t spark-extension-example-docker . 
70 | ``` 71 | 72 | Start the example Spark standalone cluster consisting of a Spark master and one worker: 73 | ```shell 74 | docker compose -f docker-compose.yml up -d 75 | ``` 76 | 77 | Run the `example.py` Spark application on the example cluster: 78 | ```shell 79 | docker exec spark-master spark-submit --master spark://master:7077 --packages uk.co.gresearch.spark:spark-extension_2.12:2.13.0-3.5 /example/example.py 80 | ``` 81 | The `--packages uk.co.gresearch.spark:spark-extension_2.12:2.13.0-3.5` argument 82 | tells `spark-submit` to add the `spark-extension` Maven package to the Spark job. 83 | 84 | Alternatively, install the `pyspark-extension` PyPi package via `pip install` and remove the `--packages` argument from `spark-submit`: 85 | ```shell 86 | docker exec spark-master pip install --user pyspark_extension==2.11.1.3.5 87 | docker exec spark-master spark-submit --master spark://master:7077 /example/example.py 88 | ``` 89 | 90 | This output proves that PySpark could call into the function `func`, wich only works when Pandas and PyArrow are installed: 91 | ``` 92 | +---+ 93 | | id| 94 | +---+ 95 | | 0| 96 | | 1| 97 | | 2| 98 | +---+ 99 | ``` 100 | 101 | Test that `spark.install_pip_package("pandas", "pyarrow")` is really required by this example by removing this line from `example.py` … 102 | ```diff 103 | from pyspark.sql import SparkSession 104 | 105 | def main(): 106 | spark = SparkSession.builder.appName("spark_app").getOrCreate() 107 | 108 | def func(df): 109 | return df 110 | 111 | from gresearch.spark import install_pip_package 112 | 113 | - spark.install_pip_package("pandas", "pyarrow") 114 | spark.range(0, 3, 1, 5).mapInPandas(func, "id long").show() 115 | 116 | if __name__ == "__main__": 117 | main() 118 | ``` 119 | 120 | … and running the `spark-submit` command again. The example does not work anymore, 121 | because the Pandas and PyArrow packages are missing from the driver: 122 | ``` 123 | Traceback (most recent call last): 124 | File "/opt/spark/python/lib/pyspark.zip/pyspark/sql/pandas/utils.py", line 27, in require_minimum_pandas_version 125 | ModuleNotFoundError: No module named 'pandas' 126 | ``` 127 | 128 | Finally, shutdown the example cluster: 129 | ```shell 130 | docker compose -f docker-compose.yml down 131 | ``` 132 | 133 | ## Known Issues 134 | 135 | Note that this feature is not supported in Python when connected with a [Spark Connect server](README.md#spark-connect-server). 136 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | # Security and Coordinated Vulnerability Disclosure Policy 2 | 3 | This project appreciates and encourages coordinated disclosure of security vulnerabilities. We prefer that you use the GitHub reporting mechanism to privately report vulnerabilities. Under the main repository's security tab, click "Report a vulnerability" to open the advisory form. 4 | 5 | If you are unable to report it via GitHub, have received no response after repeated attempts, or have other security related questions, please contact security@gr-oss.io and mention this project in the subject line. 
-------------------------------------------------------------------------------- /build-whl.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -eo pipefail 4 | 5 | base=$(cd "$(dirname "$0")"; pwd) 6 | 7 | version=$(grep --max-count=1 ".*" "$base/pom.xml" | sed -E -e "s/\s*<[^>]+>//g") 8 | artifact_id=$(grep --max-count=1 ".*" "$base/pom.xml" | sed -E -e "s/\s*<[^>]+>//g") 9 | 10 | rm -rf "$base/python/pyspark/jars/" 11 | mkdir -p "$base/python/pyspark/jars/" 12 | cp -v "$base/target/$artifact_id-$version.jar" "$base/python/pyspark/jars/" 13 | if [ $(ls -1 "$base/python/pyspark/jars/" | wc -l) -ne 1 ] 14 | then 15 | echo "There are more than one jar in '$base/python/pyspark/jars/'" 16 | ls -lah "$base/python/pyspark/jars/" 17 | exit 1 18 | fi 19 | 20 | pip install build 21 | python -m build "$base/python/" 22 | 23 | # check for missing modules in whl file 24 | pyversion=${version/SNAPSHOT/dev0} 25 | pyversion=${pyversion//-/.} 26 | 27 | missing="$(diff <(cd $base/python; find gresearch -type f | grep -v ".pyc$" | sort) <(unzip -l $base/python/dist/pyspark_extension-${pyversion}-*.whl | tail -n +4 | head -n -2 | sed -E -e "s/^ +//" -e "s/ +/ /g" | cut -d " " -f 4- | sort) | grep "^<" || true)" 28 | if [ -n "$missing" ] 29 | then 30 | echo "These files are missing from the whl file:" 31 | echo "$missing" 32 | exit 1 33 | fi 34 | -------------------------------------------------------------------------------- /bump-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # 3 | # Copyright 2020 G-Research 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | # 17 | 18 | # Script to prepare release, see RELEASE.md for details 19 | 20 | set -e -o pipefail 21 | 22 | # check for clean git status 23 | readarray -t git_status < <(git status -s --untracked-files=no 2>/dev/null) 24 | if [ ${#git_status[@]} -gt 0 ] 25 | then 26 | echo "There are pending git changes:" 27 | for (( i=0; i<${#git_status[@]}; i++ )); do echo "${git_status[$i]}" ; done 28 | exit 1 29 | fi 30 | 31 | function next_version { 32 | local version=$1 33 | local branch=$2 34 | 35 | patch=${version/*./} 36 | majmin=${version%.${patch}} 37 | 38 | if [[ $branch == "master" ]] 39 | then 40 | # minor version bump 41 | if [[ $version != *".0" ]] 42 | then 43 | echo "version is patch version, should be M.m.0: $version" >&2 44 | exit 1 45 | fi 46 | maj=${version/.*/} 47 | min=${majmin#${maj}.} 48 | next=${maj}.$((min+1)).0 49 | echo "$next" 50 | else 51 | # patch version bump 52 | next=${majmin}.$((patch+1)) 53 | echo "$next" 54 | fi 55 | } 56 | 57 | # get release and next version 58 | version=$(grep --max-count=1 ".*" pom.xml | sed -E -e "s/\s*<[^>]+>//g") 59 | pkg_version="${version/-*/}" 60 | branch=$(git rev-parse --abbrev-ref HEAD) 61 | next_pkg_version="$(next_version "$pkg_version" "$branch")" 62 | 63 | # bump the version 64 | echo "Bump version to $next_pkg_version" 65 | ./set-version.sh $next_pkg_version-SNAPSHOT 66 | 67 | # commit changes to local repo 68 | echo 69 | echo "Committing release to local git" 70 | git commit -a -m "Post-release version bump to $next_pkg_version" 71 | git show HEAD 72 | echo 73 | 74 | # push version bump to origin 75 | echo "Press to push commit to origin" 76 | read 77 | 78 | echo "Pushing release commit to origin" 79 | git push origin "master" 80 | echo 81 | -------------------------------------------------------------------------------- /examples/python-deps/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/spark:3.5.0 2 | 3 | ENV PATH="${PATH}:/opt/spark/bin" 4 | 5 | USER root 6 | RUN mkdir -p /home/spark; chown spark:spark /home/spark 7 | USER spark 8 | -------------------------------------------------------------------------------- /examples/python-deps/docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: "3" 2 | services: 3 | master: 4 | container_name: spark-master 5 | image: spark-extension-example-docker 6 | command: /opt/spark/bin/spark-class org.apache.spark.deploy.master.Master -h master 7 | environment: 8 | MASTER: spark://master:7077 9 | SPARK_PUBLIC_DNS: localhost 10 | SPARK_MASTER_WEBUI_PORT: 8080 11 | PYSPARK_PYTHON: python${PYTHON_VERSION:-3.8} 12 | PYSPARK_DRIVER_PYTHON: python${PYTHON_VERSION:-3.8} 13 | expose: 14 | - 7077 15 | ports: 16 | - 4040:4040 17 | - 8080:8080 18 | volumes: 19 | - ./:/example 20 | 21 | worker: 22 | container_name: spark-worker 23 | image: spark-extension-example-docker 24 | command: /opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker spark://master:7077 25 | environment: 26 | SPARK_WORKER_CORES: 1 27 | SPARK_WORKER_MEMORY: 1g 28 | SPARK_WORKER_PORT: 8881 29 | SPARK_WORKER_WEBUI_PORT: 8081 30 | SPARK_PUBLIC_DNS: localhost 31 | links: 32 | - master 33 | ports: 34 | - 8081:8081 35 | 36 | -------------------------------------------------------------------------------- /examples/python-deps/example.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | def main(): 4 | spark = 
SparkSession.builder.appName("spark_app").getOrCreate() 5 | 6 | def func(df): 7 | return df 8 | 9 | from gresearch.spark import install_pip_package 10 | 11 | spark.install_pip_package("pandas", "pyarrow") 12 | spark.range(0, 3, 1, 5).mapInPandas(func, "id long").show() 13 | 14 | if __name__ == "__main__": 15 | main() 16 | -------------------------------------------------------------------------------- /python/gresearch/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /python/gresearch/spark/diff/comparator/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | import abc 16 | import dataclasses 17 | from dataclasses import dataclass 18 | 19 | from py4j.java_gateway import JVMView, JavaObject 20 | 21 | from pyspark.sql import Column 22 | from pyspark.sql.functions import abs, greatest, lit 23 | from pyspark.sql.types import DataType 24 | 25 | 26 | class DiffComparator(abc.ABC): 27 | @abc.abstractmethod 28 | def equiv(self, left: Column, right: Column) -> Column: 29 | pass 30 | 31 | 32 | class DiffComparators: 33 | @staticmethod 34 | def default() -> 'DefaultDiffComparator': 35 | return DefaultDiffComparator() 36 | 37 | @staticmethod 38 | def nullSafeEqual() -> 'NullSafeEqualDiffComparator': 39 | return NullSafeEqualDiffComparator() 40 | 41 | @staticmethod 42 | def epsilon(epsilon: float) -> 'EpsilonDiffComparator': 43 | return EpsilonDiffComparator(epsilon) 44 | 45 | @staticmethod 46 | def string(whitespace_agnostic: bool = True) -> 'StringDiffComparator': 47 | return StringDiffComparator(whitespace_agnostic) 48 | 49 | @staticmethod 50 | def duration(duration: str) -> 'DurationDiffComparator': 51 | return DurationDiffComparator(duration) 52 | 53 | @staticmethod 54 | def map(key_type: DataType, value_type: DataType, key_order_sensitive: bool = False) -> 'MapDiffComparator': 55 | return MapDiffComparator(key_type, value_type, key_order_sensitive) 56 | 57 | 58 | class NullSafeEqualDiffComparator(DiffComparator): 59 | def equiv(self, left: Column, right: Column) -> Column: 60 | return left.eqNullSafe(right) 61 | 62 | 63 | class DefaultDiffComparator(NullSafeEqualDiffComparator): 64 | # for testing only 65 | def _to_java(self, jvm: JVMView) -> JavaObject: 66 | return jvm.uk.co.gresearch.spark.diff.DiffComparators.default() 67 | 68 | 69 | @dataclass(frozen=True) 70 | class EpsilonDiffComparator(DiffComparator): 71 | epsilon: float 72 | relative: bool = True 73 | inclusive: bool = True 74 | 75 | def as_relative(self) -> 'EpsilonDiffComparator': 76 | return dataclasses.replace(self, relative=True) 77 | 78 | def as_absolute(self) -> 'EpsilonDiffComparator': 79 | return dataclasses.replace(self, relative=False) 80 | 81 | def as_inclusive(self) -> 'EpsilonDiffComparator': 82 | return dataclasses.replace(self, inclusive=True) 83 | 84 | def as_exclusive(self) -> 'EpsilonDiffComparator': 85 | return dataclasses.replace(self, inclusive=False) 86 | 87 | def equiv(self, left: Column, right: Column) -> Column: 88 | threshold = greatest(abs(left), abs(right)) * self.epsilon if self.relative else lit(self.epsilon) 89 | 90 | def inclusive_epsilon(diff: Column) -> Column: 91 | return diff.__le__(threshold) 92 | 93 | def exclusive_epsilon(diff: Column) -> Column: 94 | return diff.__lt__(threshold) 95 | 96 | in_epsilon = inclusive_epsilon if self.inclusive else exclusive_epsilon 97 | return left.isNull() & right.isNull() | left.isNotNull() & right.isNotNull() & in_epsilon(abs(left - right)) 98 | 99 | 100 | @dataclass(frozen=True) 101 | class StringDiffComparator(DiffComparator): 102 | whitespace_agnostic: bool 103 | 104 | def equiv(self, left: Column, right: Column) -> Column: 105 | return left.eqNullSafe(right) 106 | 107 | 108 | @dataclass(frozen=True) 109 | class DurationDiffComparator(DiffComparator): 110 | duration: str 111 | inclusive: bool = True 112 | 113 | def as_inclusive(self) -> 'DurationDiffComparator': 114 | return dataclasses.replace(self, inclusive=True) 115 | 116 | def as_exclusive(self) -> 'DurationDiffComparator': 117 | return dataclasses.replace(self, inclusive=False) 118 | 119 | def equiv(self, left: Column, right: Column) -> Column: 120 | 
return left.eqNullSafe(right) 121 | 122 | 123 | @dataclass(frozen=True) 124 | class MapDiffComparator(DiffComparator): 125 | key_type: DataType 126 | value_type: DataType 127 | key_order_sensitive: bool 128 | 129 | def equiv(self, left: Column, right: Column) -> Column: 130 | return left.eqNullSafe(right) 131 | -------------------------------------------------------------------------------- /python/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /python/requirements-3.0_2.12.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.0.3 4 | -------------------------------------------------------------------------------- /python/requirements-3.1_2.12.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.1.3 4 | -------------------------------------------------------------------------------- /python/requirements-3.2_2.12.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.2.2 4 | -------------------------------------------------------------------------------- /python/requirements-3.2_2.13.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.2.2 4 | -------------------------------------------------------------------------------- /python/requirements-3.3_2.12.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.3.1 4 | -------------------------------------------------------------------------------- /python/requirements-3.3_2.13.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.3.1 4 | -------------------------------------------------------------------------------- /python/requirements-3.4_2.12.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.4.0 4 | -------------------------------------------------------------------------------- /python/requirements-3.4_2.13.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.4.0 4 | -------------------------------------------------------------------------------- /python/requirements-3.5_2.12.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.5.0 4 | -------------------------------------------------------------------------------- /python/requirements-3.5_2.13.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==3.5.0 4 | -------------------------------------------------------------------------------- /python/requirements-4.0_2.13.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==4.0.0 4 | -------------------------------------------------------------------------------- 
/python/requirements-4.1_2.13.txt: -------------------------------------------------------------------------------- 1 | py4j 2 | # keep in-sync with pom.xml 3 | pyspark==4.0.0 4 | -------------------------------------------------------------------------------- /python/setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright 2023 G-Research 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | from pathlib import Path 18 | from setuptools import setup 19 | 20 | jar_version = '2.14.0-3.5-SNAPSHOT' 21 | scala_version = '2.13.8' 22 | scala_compat_version = '.'.join(scala_version.split('.')[:2]) 23 | spark_compat_version = jar_version.split('-')[1] 24 | version = jar_version.replace('SNAPSHOT', 'dev0').replace('-', '.') 25 | 26 | # read the contents of the README.md file 27 | long_description = (Path(__file__).parent / "README.md").read_text() 28 | 29 | setup( 30 | name="pyspark-extension", 31 | version=version, 32 | description="A library that provides useful extensions to Apache Spark.", 33 | long_description=long_description, 34 | long_description_content_type="text/markdown", 35 | author="Enrico Minack", 36 | author_email="github@enrico.minack.dev", 37 | url="https://github.com/G-Research/spark-extension", 38 | tests_require=[f"pyspark~={spark_compat_version}.0", "py4j"], 39 | packages=[ 40 | "gresearch", 41 | "gresearch.spark", 42 | "gresearch.spark.diff", 43 | "gresearch.spark.diff.comparator", 44 | "gresearch.spark.parquet", 45 | "pyspark.jars", 46 | ], 47 | include_package_data=False, 48 | package_data={ 49 | "pyspark.jars": [f"*_{scala_compat_version}-{jar_version}.jar"], 50 | }, 51 | license="http://www.apache.org/licenses/LICENSE-2.0.html", 52 | python_requires=">=3.7", 53 | classifiers=[ 54 | "Development Status :: 5 - Production/Stable", 55 | "License :: OSI Approved :: Apache Software License", 56 | "Programming Language :: Python :: 3", 57 | "Programming Language :: Python :: 3.7", 58 | "Programming Language :: Python :: 3.8", 59 | "Programming Language :: Python :: 3.9", 60 | "Programming Language :: Python :: 3.10", 61 | "Programming Language :: Python :: 3.11", 62 | "Programming Language :: Python :: 3.12", 63 | "Programming Language :: Python :: 3.13", 64 | "Programming Language :: Python :: Implementation :: CPython", 65 | "Programming Language :: Python :: Implementation :: PyPy", 66 | "Typing :: Typed", 67 | ], 68 | ) 69 | -------------------------------------------------------------------------------- /python/test/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | -------------------------------------------------------------------------------- /python/test/requirements.txt: -------------------------------------------------------------------------------- 1 | grpcio>=1.48.1 2 | pandas>=1.0.5 3 | pyarrow>=4.0.0 4 | pytest 5 | unittest-xml-reporting 6 | -------------------------------------------------------------------------------- /python/test/spark_common.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import logging 16 | import os 17 | import sys 18 | import unittest 19 | from contextlib import contextmanager 20 | from pathlib import Path 21 | 22 | from pyspark import SparkConf 23 | from pyspark.sql import SparkSession 24 | 25 | logger = logging.getLogger() 26 | logger.level = logging.INFO 27 | 28 | 29 | @contextmanager 30 | def spark_session(): 31 | session = SparkTest.get_spark_session() 32 | try: 33 | yield session 34 | finally: 35 | session.stop() 36 | 37 | 38 | class SparkTest(unittest.TestCase): 39 | 40 | @staticmethod 41 | def main(file: str): 42 | if len(sys.argv) == 2: 43 | # location to store test results provided, this requires package unittest-xml-reporting 44 | import xmlrunner 45 | 46 | unittest.main( 47 | module=f'test.{Path(file).name[:-3]}', 48 | testRunner=xmlrunner.XMLTestRunner(output=sys.argv[1]), 49 | argv=sys.argv[:1], 50 | # these make sure that some options that are not applicable 51 | # remain hidden from the help menu. 
52 | failfast=False, buffer=False, catchbreak=False 53 | ) 54 | else: 55 | unittest.main() 56 | 57 | @staticmethod 58 | def get_pom_path() -> str: 59 | paths = ['.', '..', os.path.join('..', '..')] 60 | for path in paths: 61 | if os.path.exists(os.path.join(path, 'pom.xml')): 62 | return path 63 | raise RuntimeError('Could not find path to pom.xml, looked here: {}'.format(', '.join(paths))) 64 | 65 | @staticmethod 66 | def get_spark_config(path) -> SparkConf: 67 | master = 'local[2]' 68 | conf = SparkConf().setAppName('unit test').setMaster(master) 69 | return conf.setAll([ 70 | ('spark.ui.showConsoleProgress', 'false'), 71 | ('spark.test.home', os.environ.get('SPARK_HOME')), 72 | ('spark.locality.wait', '0'), 73 | ('spark.driver.extraClassPath', '{}'.format(':'.join([ 74 | os.path.join(os.getcwd(), path, 'target', 'classes'), 75 | os.path.join(os.getcwd(), path, 'target', 'test-classes'), 76 | ]))), 77 | ]) 78 | 79 | @classmethod 80 | def get_spark_session(cls) -> SparkSession: 81 | builder = SparkSession.builder 82 | 83 | if 'TEST_SPARK_CONNECT_SERVER' in os.environ: 84 | builder.remote(os.environ['TEST_SPARK_CONNECT_SERVER']) 85 | elif 'PYSPARK_GATEWAY_PORT' in os.environ: 86 | logging.info('Running inside existing Spark environment') 87 | else: 88 | logging.info('Setting up Spark environment') 89 | path = cls.get_pom_path() 90 | conf = cls.get_spark_config(path) 91 | builder.config(conf=conf) 92 | 93 | return builder.getOrCreate() 94 | 95 | spark: SparkSession = None 96 | is_spark_connect: bool = 'TEST_SPARK_CONNECT_SERVER' in os.environ 97 | 98 | @classmethod 99 | def setUpClass(cls): 100 | super(SparkTest, cls).setUpClass() 101 | logging.info('launching Spark session') 102 | cls.spark = cls.get_spark_session() 103 | 104 | @classmethod 105 | def tearDownClass(cls): 106 | logging.info('stopping Spark session') 107 | cls.spark.stop() 108 | super(SparkTest, cls).tearDownClass() 109 | 110 | @contextmanager 111 | def sql_conf(self, pairs): 112 | """ 113 | Copied from pyspark/testing/sqlutils available from PySpark 3.5.0 and higher. 114 | https://github.com/apache/spark/blob/v3.5.0/python/pyspark/testing/sqlutils.py#L171 115 | http://www.apache.org/licenses/LICENSE-2.0 116 | 117 | A convenient context manager to test some configuration specific logic. This sets 118 | `value` to the configuration `key` and then restores it back when it exits. 119 | """ 120 | assert isinstance(pairs, dict), "pairs should be a dictionary." 121 | assert hasattr(self, "spark"), "it should have 'spark' attribute, having a spark session." 122 | 123 | keys = pairs.keys() 124 | new_values = pairs.values() 125 | old_values = [self.spark.conf.get(key, None) for key in keys] 126 | for key, new_value in zip(keys, new_values): 127 | self.spark.conf.set(key, new_value) 128 | try: 129 | yield 130 | finally: 131 | for key, old_value in zip(keys, old_values): 132 | if old_value is None: 133 | self.spark.conf.unset(key) 134 | else: 135 | self.spark.conf.set(key, old_value) 136 | -------------------------------------------------------------------------------- /python/test/test_histogram.py: -------------------------------------------------------------------------------- 1 | # Copyright 2020 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from unittest import skipIf 16 | 17 | from spark_common import SparkTest 18 | import gresearch.spark 19 | 20 | 21 | @skipIf(SparkTest.is_spark_connect, "Spark Connect does not provide access to the JVM, required by Historgam") 22 | class HistogramTest(SparkTest): 23 | 24 | @classmethod 25 | def setUpClass(cls): 26 | super(HistogramTest, cls).setUpClass() 27 | 28 | cls.df = cls.spark.createDataFrame([ 29 | (1, 1), 30 | (1, 2), 31 | (1, 10), 32 | (2, -3), 33 | (2, 5), 34 | (3, 8), 35 | ], ['id', 'value']) 36 | 37 | def test_histogram_with_ints(self): 38 | hist = self.df.histogram([-5, 0, 5], 'value', 'id').orderBy('id').collect() 39 | self.assertEqual([ 40 | {'id': 1, '≤-5': 0, '≤0': 0, '≤5': 2, '>5': 1}, 41 | {'id': 2, '≤-5': 0, '≤0': 1, '≤5': 1, '>5': 0}, 42 | {'id': 3, '≤-5': 0, '≤0': 0, '≤5': 0, '>5': 1}, 43 | ], [row.asDict() for row in hist]) 44 | 45 | def test_histogram_with_floats(self): 46 | hist = self.df.histogram([-5.0, 0.0, 5.0], 'value', 'id').orderBy('id').collect() 47 | self.assertEqual([ 48 | {'id': 1, '≤-5.0': 0, '≤0.0': 0, '≤5.0': 2, '>5.0': 1}, 49 | {'id': 2, '≤-5.0': 0, '≤0.0': 1, '≤5.0': 1, '>5.0': 0}, 50 | {'id': 3, '≤-5.0': 0, '≤0.0': 0, '≤5.0': 0, '>5.0': 1}, 51 | ], [row.asDict() for row in hist]) 52 | 53 | 54 | if __name__ == '__main__': 55 | SparkTest.main(__file__) 56 | -------------------------------------------------------------------------------- /python/test/test_job_description.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from unittest import skipIf 16 | 17 | from pyspark import TaskContext, SparkContext 18 | from typing import Optional 19 | 20 | from spark_common import SparkTest 21 | from gresearch.spark import job_description, append_job_description 22 | 23 | 24 | @skipIf(SparkTest.is_spark_connect, "Spark Connect does not provide access to the JVM, required by JobDescription") 25 | class JobDescriptionTest(SparkTest): 26 | 27 | def _assert_job_description(self, expected: Optional[str]): 28 | def get_job_description_func(part): 29 | def func(row): 30 | return row.id, part, TaskContext.get().getLocalProperty("spark.job.description") 31 | return func 32 | 33 | descriptions = self.spark.range(3, numPartitions=3).rdd \ 34 | .mapPartitionsWithIndex(lambda part, it: map(get_job_description_func(part), it)) \ 35 | .collect() 36 | self.assertEqual( 37 | [(0, 0, expected), (1, 1, expected), (2, 2, expected)], 38 | descriptions 39 | ) 40 | 41 | def setUp(self) -> None: 42 | SparkContext._active_spark_context.setJobDescription(None) 43 | 44 | def test_with_job_description(self): 45 | self._assert_job_description(None) 46 | with job_description("job description"): 47 | self._assert_job_description("job description") 48 | with job_description("inner job description"): 49 | self._assert_job_description("inner job description") 50 | self._assert_job_description("job description") 51 | with job_description("inner job description", True): 52 | self._assert_job_description("job description") 53 | self._assert_job_description("job description") 54 | self._assert_job_description(None) 55 | with job_description("other job description", True): 56 | self._assert_job_description("other job description") 57 | self._assert_job_description(None) 58 | 59 | def test_append_job_description(self): 60 | self._assert_job_description(None) 61 | with append_job_description("job"): 62 | self._assert_job_description("job") 63 | with append_job_description("description"): 64 | self._assert_job_description("job - description") 65 | self._assert_job_description("job") 66 | with append_job_description("description 2", " "): 67 | self._assert_job_description("job description 2") 68 | self._assert_job_description("job") 69 | self._assert_job_description(None) 70 | 71 | 72 | if __name__ == '__main__': 73 | SparkTest.main(__file__) 74 | -------------------------------------------------------------------------------- /python/test/test_parquet.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 G-Research 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from pathlib import Path 16 | from unittest import skipIf 17 | 18 | from spark_common import SparkTest 19 | import gresearch.spark.parquet 20 | 21 | 22 | @skipIf(SparkTest.is_spark_connect, "Spark Connect does not provide access to the JVM, required by Parquet") 23 | class ParquetTest(SparkTest): 24 | 25 | test_file = str((Path(__file__).parent.parent.parent / "src" / "test" / "files" / "test.parquet").resolve()) 26 | 27 | def test_parquet_metadata(self): 28 | self.assertEqual(self.spark.read.parquet_metadata(self.test_file).count(), 2) 29 | self.assertEqual(self.spark.read.parquet_metadata(self.test_file, self.test_file).count(), 2) 30 | self.assertEqual(self.spark.read.parquet_metadata(self.test_file, parallelism=100).count(), 2) 31 | self.assertEqual(self.spark.read.parquet_metadata(self.test_file, self.test_file, parallelism=100).count(), 2) 32 | 33 | def test_parquet_schema(self): 34 | self.assertEqual(self.spark.read.parquet_schema(self.test_file).count(), 4) 35 | self.assertEqual(self.spark.read.parquet_schema(self.test_file, self.test_file).count(), 4) 36 | self.assertEqual(self.spark.read.parquet_schema(self.test_file, parallelism=100).count(), 4) 37 | self.assertEqual(self.spark.read.parquet_schema(self.test_file, self.test_file, parallelism=100).count(), 4) 38 | 39 | def test_parquet_blocks(self): 40 | self.assertEqual(self.spark.read.parquet_blocks(self.test_file).count(), 3) 41 | self.assertEqual(self.spark.read.parquet_blocks(self.test_file, self.test_file).count(), 3) 42 | self.assertEqual(self.spark.read.parquet_blocks(self.test_file, parallelism=100).count(), 3) 43 | self.assertEqual(self.spark.read.parquet_blocks(self.test_file, self.test_file, parallelism=100).count(), 3) 44 | 45 | def test_parquet_block_columns(self): 46 | self.assertEqual(self.spark.read.parquet_block_columns(self.test_file).count(), 6) 47 | self.assertEqual(self.spark.read.parquet_block_columns(self.test_file, self.test_file).count(), 6) 48 | self.assertEqual(self.spark.read.parquet_block_columns(self.test_file, parallelism=100).count(), 6) 49 | self.assertEqual(self.spark.read.parquet_block_columns(self.test_file, self.test_file, parallelism=100).count(), 6) 50 | 51 | def test_parquet_partitions(self): 52 | self.assertEqual(self.spark.read.parquet_partitions(self.test_file).count(), 2) 53 | self.assertEqual(self.spark.read.parquet_partitions(self.test_file, self.test_file).count(), 2) 54 | self.assertEqual(self.spark.read.parquet_partitions(self.test_file, parallelism=100).count(), 2) 55 | self.assertEqual(self.spark.read.parquet_partitions(self.test_file, self.test_file, parallelism=100).count(), 2) 56 | 57 | 58 | if __name__ == '__main__': 59 | SparkTest.main(__file__) 60 | -------------------------------------------------------------------------------- /set-version.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ $# -eq 1 ] 4 | then 5 | IFS=- 6 | read version flavour <<< "$1" 7 | 8 | echo "setting version=$version${flavour:+ with }$flavour" 9 | 10 | sed -i -E \ 11 | -e "s%^( )[^-]+-([^-]+).*()$%\1$version-\2${flavour:+-}$flavour\3%" \ 12 | pom.xml 13 | 14 | version=$(grep -m 1 version pom.xml | sed "s/\s*<[^>]*>\s*//g") 15 | 16 | sed -i -E \ 17 | -e "s/(jar_version *= *).*/\1'$version'/" \ 18 | python/setup.py 19 | elif [ $# -eq 2 ] 20 | then 21 | spark=$1 22 | scala=$2 23 | 24 | spark_compat=${spark%.*} 25 | scala_compat=${scala%.*} 26 | 27 | spark_major=${spark_compat%.*} 28 | scala_major=${scala_compat%.*} 29 | 30 | 
spark_minor=${spark_compat/*./} 31 | scala_minor=${scala_compat/*./} 32 | 33 | spark_patch=${spark/*./} 34 | scala_patch=${scala/*./} 35 | 36 | echo "setting spark=$spark and scala=$scala" 37 | sed -i -E \ 38 | -e "s%^( )([^_]+)[_0-9.]+()$%\1\2_${scala_compat}\3%" \ 39 | -e "s%^( )([^-]+)-[^-]+(.*)$%\1\2-$spark_compat\3%" \ 40 | -e "s%^( ).+()$%\1${scala_major}\2%" \ 41 | -e "s%^( ).+()$%\1${scala_minor}\2%" \ 42 | -e "s%^( ).+()$%\1${scala_patch}\2%" \ 43 | -e "s%^( ).+()$%\1${spark_major}\2%" \ 44 | -e "s%^( ).+()$%\1${spark_minor}\2%" \ 45 | -e "s%^( ).+()$%\1${spark_patch}\2%" \ 46 | pom.xml 47 | 48 | version=$(grep -m 1 version pom.xml | sed "s/\s*<[^>]*>\s*//g") 49 | 50 | sed -i -E \ 51 | -e "s/(jar_version *= *).*/\1'$version'/" \ 52 | -e "s/(scala_version *= *).*/\1'$scala'/" \ 53 | python/setup.py 54 | else 55 | echo "Provide the Spark-Extension version (e.g. 2.5.0 or 2.5.0-SNAPSHOT), or the Spark and Scala version" 56 | exit 1 57 | fi 58 | 59 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/org/apache/spark/sql/extension/package.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.5/org/apache/spark/sql/extension/package.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.5/uk/co/gresearch/spark/Backticks.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.1/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.sql.catalyst.expressions.Expression 20 | 21 | object UnixMicros { 22 | def unixMicros(child: Expression): Expression = { 23 | throw new NotImplementedError() 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.1/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.1/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.0/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/SplitFile.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/org/apache/spark/sql/extension/package.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.5/org/apache/spark/sql/extension/package.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.5/uk/co/gresearch/spark/Backticks.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.sql.catalyst.trees.TreeNode 20 | 21 | /** 22 | * Spark version specific trait that back-ports BinaryLike[T].withNewChildrenInternal(T, T) 23 | * to Spark 3.0 and 3.1. This is empty in Spark 3.2 and beyond. 
24 | */ 25 | trait BinaryLikeWithNewChildrenInternal[T <: TreeNode[T]] { 26 | self: TreeNode[T] => 27 | 28 | /** 29 | * Method `withNewChildrenInternal` is required for Spark 3.2 and beyond. 30 | * Before that, Spark calls `withNewChildren`, which uses `makeCopy`, which 31 | * "Must be overridden by child classes that have constructor arguments 32 | * that are not present in the productIterator.", 33 | * which is not the case where BinaryLikeWithNewChildrenInternal is used here. 34 | * So nothing needs to be overridden. 35 | */ 36 | protected def withNewChildrenInternal(newLeft: T, newRight: T): T 37 | } -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.sql.catalyst.expressions 20 | import org.apache.spark.sql.catalyst.expressions.Expression 21 | 22 | object UnixMicros { 23 | def unixMicros(child: Expression): Expression = expressions.UnixMicros(child) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.parquet.hadoop.metadata.BlockMetaData 20 | 21 | object BlockMetaDataUtil { 22 | def getOrdinal(block: BlockMetaData): Option[Int] = None 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.parquet.schema.PrimitiveType 20 | 21 | object PrimitiveTypeUtil { 22 | def getLogicalTypeAnnotation(primitive: PrimitiveType): Option[String] = None 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.1/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/SplitFile.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/org/apache/spark/sql/extension/package.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.5/org/apache/spark/sql/extension/package.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.5/uk/co/gresearch/spark/Backticks.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | /** 20 | * Spark version specific trait that back-ports BinaryLike[T].withNewChildrenInternal(T, T) 21 | * to Spark 3.0 and 3.1. This is empty in Spark 3.2 and beyond. 22 | */ 23 | trait BinaryLikeWithNewChildrenInternal[T] 24 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.1/uk/co/gresearch/spark/UnixMicros.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.parquet.hadoop.metadata.BlockMetaData 20 | 21 | object BlockMetaDataUtil { 22 | def getOrdinal(block: BlockMetaData): Option[Int] = Some(block.getOrdinal) 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.parquet.schema.PrimitiveType 20 | 21 | object PrimitiveTypeUtil { 22 | def getLogicalTypeAnnotation(primitive: PrimitiveType): Option[String] = 23 | Option(primitive.getLogicalTypeAnnotation).map(_.toString) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.2/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.spark.sql.execution.datasources.PartitionedFile 20 | 21 | case class SplitFile(filePath: String, start: Long, length: Long, fileSize: Option[Long]) 22 | 23 | object SplitFile { 24 | def apply(file: PartitionedFile): SplitFile = SplitFile(file.filePath, file.start, file.length, None) 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/org/apache/spark/sql/extension/package.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.5/org/apache/spark/sql/extension/package.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.5/uk/co/gresearch/spark/Backticks.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.2/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.1/uk/co/gresearch/spark/UnixMicros.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala 
-------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.3/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.spark.sql.execution.datasources.PartitionedFile 20 | 21 | case class SplitFile(filePath: String, start: Long, length: Long, fileSize: Option[Long]) 22 | 23 | object SplitFile { 24 | def apply(file: PartitionedFile): SplitFile = SplitFile(file.filePath, file.start, file.length, Some(file.fileSize)) 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/org/apache/spark/sql/extension/package.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.5/org/apache/spark/sql/extension/package.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.5/uk/co/gresearch/spark/Backticks.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.2/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.1/uk/co/gresearch/spark/UnixMicros.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.parquet.hadoop.metadata.FileMetaData 20 | 21 | object FileMetaDataUtil { 22 | def getEncryptionType(fileMetaData: FileMetaData): Option[String] = None 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.4/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-4.0/uk/co/gresearch/spark/parquet/SplitFile.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/org/apache/spark/sql/extension/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import org.apache.spark.sql.catalyst.expressions.Expression 20 | 21 | package object extension { 22 | implicit class ColumnExtension(col: Column) { 23 | // Column.expr exists in this Spark version 24 | def sql: String = col.expr.sql 25 | } 26 | 27 | implicit class ExpressionExtension(expr: Expression) { 28 | def column: Column = new Column(expr) 29 | } 30 | } 31 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2021 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import java.util.regex.Pattern 20 | 21 | object Backticks { 22 | 23 | // https://github.com/apache/spark/blob/523ff15/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/QuotingUtils.scala#L46 24 | private val validIdentPattern = Pattern.compile("^[a-zA-Z_][a-zA-Z0-9_]*") 25 | 26 | /** 27 | * Detects if column name part requires quoting. 28 | * https://github.com/apache/spark/blob/523ff15/sql/api/src/main/scala/org/apache/spark/sql/catalyst/util/QuotingUtils.scala#L48 29 | */ 30 | private def needQuote(part: String): Boolean = { 31 | !validIdentPattern.matcher(part).matches() 32 | } 33 | 34 | /** 35 | * Encloses the given strings with backticks (backquotes) if needed. 36 | * 37 | * Backticks are not needed for strings that start with a letter (`a`-`z` and `A`-`Z`) or an underscore, 38 | * and contain only letters, numbers and underscores. 39 | * 40 | * Multiple strings will be enclosed individually and concatenated with dots (`.`). 41 | * 42 | * This is useful when referencing column names that contain special characters like dots (`.`) or backquotes. 43 | * 44 | * Examples: 45 | * {{{ 46 | * col("a.column") // this references the field "column" of column "a" 47 | * col("`a.column`") // this reference the column with the name "a.column" 48 | * col(Backticks.column_name("column")) // produces "column" 49 | * col(Backticks.column_name("a.column")) // produces "`a.column`" 50 | * col(Backticks.column_name("a column")) // produces "`a column`" 51 | * col(Backticks.column_name("`a.column`")) // produces "`a.column`" 52 | * col(Backticks.column_name("a.column", "a.field")) // produces "`a.column`.`a.field`" 53 | * }}} 54 | * 55 | * @param string 56 | * a string 57 | * @param strings 58 | * more strings 59 | * @return 60 | */ 61 | @scala.annotation.varargs 62 | def column_name(string: String, strings: String*): String = 63 | (string +: strings) 64 | .map(s => if (needQuote(s)) s"`${s.replace("`", "``")}`" else s) 65 | .mkString(".") 66 | 67 | } 68 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.2/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.1/uk/co/gresearch/spark/UnixMicros.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with 
the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.parquet.hadoop.metadata.FileMetaData 20 | 21 | object FileMetaDataUtil { 22 | def getEncryptionType(fileMetaData: FileMetaData): Option[String] = 23 | Some(fileMetaData.getEncryptionType.name()) 24 | } 25 | -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-3.5/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-4.0/uk/co/gresearch/spark/parquet/SplitFile.scala -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/org/apache/spark/sql/extension/extension.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2024 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package org.apache.spark.sql 18 | 19 | import org.apache.spark.sql.catalyst.expressions.Expression 20 | import org.apache.spark.sql.classic.ExpressionUtils.{column => toColumn, expression} 21 | 22 | package object extension { 23 | implicit class ColumnExtension(col: Column) { 24 | def expr: Expression = expression(col) 25 | def sql: String = col.node.sql 26 | } 27 | 28 | implicit class ExpressionExtension(expr: Expression) { 29 | def column: Column = toColumn(expr) 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/uk/co/gresearch/spark/Backticks.scala: -------------------------------------------------------------------------------- 1 | package uk.co.gresearch.spark 2 | 3 | import org.apache.spark.sql.catalyst.util.QuotingUtils 4 | 5 | /* 6 | * Copyright 2021 G-Research 7 | * 8 | * Licensed under the Apache License, Version 2.0 (the "License"); 9 | * you may not use this file except in compliance with the License. 
10 | * You may obtain a copy of the License at 11 | * 12 | * http://www.apache.org/licenses/LICENSE-2.0 13 | * 14 | * Unless required by applicable law or agreed to in writing, software 15 | * distributed under the License is distributed on an "AS IS" BASIS, 16 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 17 | * See the License for the specific language governing permissions and 18 | * limitations under the License. 19 | */ 20 | 21 | object Backticks { 22 | 23 | /** 24 | * Encloses the given strings with backticks (backquotes) if needed. 25 | * 26 | * Backticks are not needed for strings that start with a letter (`a`-`z` and `A`-`Z`) or an underscore, 27 | * and contain only letters, numbers and underscores. 28 | * 29 | * Multiple strings will be enclosed individually and concatenated with dots (`.`). 30 | * 31 | * This is useful when referencing column names that contain special characters like dots (`.`) or backquotes. 32 | * 33 | * Examples: 34 | * {{{ 35 | * col("a.column") // this references the field "column" of column "a" 36 | * col("`a.column`") // this reference the column with the name "a.column" 37 | * col(Backticks.column_name("column")) // produces "column" 38 | * col(Backticks.column_name("a.column")) // produces "`a.column`" 39 | * col(Backticks.column_name("a column")) // produces "`a column`" 40 | * col(Backticks.column_name("`a.column`")) // produces "`a.column`" 41 | * col(Backticks.column_name("a.column", "a.field")) // produces "`a.column`.`a.field`" 42 | * }}} 43 | * 44 | * @param string 45 | * a string 46 | * @param strings 47 | * more strings 48 | * @return 49 | */ 50 | @scala.annotation.varargs 51 | def column_name(string: String, strings: String*): String = 52 | QuotingUtils.quoted(Array.from(string +: strings)) 53 | 54 | } 55 | -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.2/uk/co/gresearch/spark/BinaryLikeWithNewChildrenInternal.scala -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/uk/co/gresearch/spark/UnixMicros.scala: -------------------------------------------------------------------------------- 1 | ../../../../../scala-spark-3.1/uk/co/gresearch/spark/UnixMicros.scala -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/BlockMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.5/uk/co/gresearch/spark/parquet/FileMetaDataUtil.scala -------------------------------------------------------------------------------- /src/main/scala-spark-4.0/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala: -------------------------------------------------------------------------------- 1 | ../../../../../../scala-spark-3.2/uk/co/gresearch/spark/parquet/PrimitiveTypeUtil.scala -------------------------------------------------------------------------------- 
/src/main/scala-spark-4.0/uk/co/gresearch/spark/parquet/SplitFile.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.parquet 18 | 19 | import org.apache.spark.sql.execution.datasources.PartitionedFile 20 | 21 | case class SplitFile(filePath: String, start: Long, length: Long, fileSize: Option[Long]) 22 | 23 | object SplitFile { 24 | def apply(file: PartitionedFile): SplitFile = SplitFile(file.filePath.toString, file.start, file.length, Some(file.fileSize)) 25 | } 26 | -------------------------------------------------------------------------------- /src/main/scala-spark-4.1: -------------------------------------------------------------------------------- 1 | scala-spark-4.0 -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co 18 | 19 | package object gresearch { 20 | 21 | trait ConditionalCall[T] { 22 | def call(f: T => T): T 23 | def either[R](f: T => R): ConditionalCallOr[T, R] 24 | } 25 | 26 | trait ConditionalCallOr[T, R] { 27 | def or(f: T => R): R 28 | } 29 | 30 | case class TrueCall[T](t: T) extends ConditionalCall[T] { 31 | override def call(f: T => T): T = f(t) 32 | override def either[R](f: T => R): ConditionalCallOr[T, R] = TrueCallOr[T, R](f(t)) 33 | } 34 | 35 | case class FalseCall[T](t: T) extends ConditionalCall[T] { 36 | override def call(f: T => T): T = t 37 | override def either[R](f: T => R): ConditionalCallOr[T, R] = FalseCallOr[T, R](t) 38 | } 39 | 40 | case class TrueCallOr[T, R](r: R) extends ConditionalCallOr[T, R] { 41 | override def or(f: T => R): R = r 42 | } 43 | 44 | case class FalseCallOr[T, R](t: T) extends ConditionalCallOr[T, R] { 45 | override def or(f: T => R): R = f(t) 46 | } 47 | 48 | implicit class ExtendedAny[T](t: T) { 49 | 50 | /** 51 | * Allows to call a function on the decorated instance conditionally. 
52 | * 53 | * This allows fluent code like 54 | * 55 | * {{{ 56 | * i.doThis() 57 | * .doThat() 58 | * .on(condition).call(function) 59 | * .on(condition).either(function1).or(function2) 60 | * .doMore() 61 | * }}} 62 | * 63 | * rather than 64 | * 65 | * {{{ 66 | * val temp = i.doThis() 67 | * .doThat() 68 | * val temp2 = if (condition) function(temp) else temp 69 | * temp2.doMore() 70 | * }}} 71 | * 72 | * which either needs many temporary variables or duplicate code. 73 | * 74 | * @param condition 75 | * condition 76 | * @return 77 | * the function result 78 | */ 79 | def on(condition: Boolean): ConditionalCall[T] = { 80 | if (condition) TrueCall[T](t) else FalseCall[T](t) 81 | } 82 | 83 | /** 84 | * Allows to call a function on the decorated instance conditionally. This is an alias for the `on` function. 85 | * 86 | * This allows fluent code like 87 | * 88 | * {{{ 89 | * i.doThis() 90 | * .doThat() 91 | * .when(condition).call(function) 92 | * .when(condition).either(function1).or(function2) 93 | * .doMore() 94 | * 95 | * 96 | * rather than 97 | * 98 | * {{{ 99 | * val temp = i.doThis() 100 | * .doThat() 101 | * val temp2 = if (condition) function(temp) else temp 102 | * temp2.doMore() 103 | * }}} 104 | * 105 | * which either needs many temporary variables or duplicate code. 106 | * 107 | * @param condition 108 | * condition 109 | * @return 110 | * the function result 111 | */ 112 | def when(condition: Boolean): ConditionalCall[T] = on(condition) 113 | 114 | /** 115 | * Executes the given function on the decorated instance. 116 | * 117 | * This allows writing fluent code like 118 | * 119 | * {{{ 120 | * i.doThis() 121 | * .doThat() 122 | * .call(function) 123 | * .doMore() 124 | * }}} 125 | * 126 | * rather than 127 | * 128 | * {{{ 129 | * function( 130 | * i.doThis() 131 | * .doThat() 132 | * ).doMore() 133 | * }}} 134 | * 135 | * where the effective sequence of operations is not clear. 136 | * 137 | * @param f 138 | * function 139 | * @return 140 | * the function result 141 | */ 142 | def call[R](f: T => R): R = f(t) 143 | } 144 | 145 | } 146 | -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/spark/BuildVersion.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import java.util.Properties 20 | 21 | /** 22 | * Provides versions from build environment. 
23 | */ 24 | trait BuildVersion { 25 | val propertyFileName = "spark-extension-build.properties" 26 | 27 | lazy val props: Properties = { 28 | val properties = new Properties 29 | 30 | val in = Option(Thread.currentThread().getContextClassLoader.getResourceAsStream(propertyFileName)) 31 | if (in.isEmpty) { 32 | throw new RuntimeException(s"Property file $propertyFileName not found in class path") 33 | } 34 | 35 | in.foreach(properties.load) 36 | properties 37 | } 38 | 39 | lazy val VersionString: String = props.getProperty("project.version") 40 | 41 | lazy val BuildSparkMajorVersion: Int = props.getProperty("spark.major.version").toInt 42 | lazy val BuildSparkMinorVersion: Int = props.getProperty("spark.minor.version").toInt 43 | lazy val BuildSparkPatchVersion: Int = props.getProperty("spark.patch.version").split("-").head.toInt 44 | lazy val BuildSparkCompatVersionString: String = props.getProperty("spark.compat.version") 45 | 46 | lazy val BuildScalaMajorVersion: Int = props.getProperty("scala.major.version").toInt 47 | lazy val BuildScalaMinorVersion: Int = props.getProperty("scala.minor.version").toInt 48 | lazy val BuildScalaPatchVersion: Int = props.getProperty("scala.patch.version").toInt 49 | lazy val BuildScalaCompatVersionString: String = props.getProperty("scala.compat.version") 50 | 51 | val BuildSparkVersion: (Int, Int, Int) = (BuildSparkMajorVersion, BuildSparkMinorVersion, BuildSparkPatchVersion) 52 | val BuildSparkCompatVersion: (Int, Int) = (BuildSparkMajorVersion, BuildSparkMinorVersion) 53 | 54 | val BuildScalaVersion: (Int, Int, Int) = (BuildScalaMajorVersion, BuildScalaMinorVersion, BuildScalaPatchVersion) 55 | val BuildScalaCompatVersion: (Int, Int) = (BuildScalaMajorVersion, BuildScalaMinorVersion) 56 | } 57 | -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/spark/Histogram.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.sql.functions.{sum, when} 20 | import org.apache.spark.sql.{Column, DataFrame, Dataset} 21 | import uk.co.gresearch.ExtendedAny 22 | 23 | import scala.collection.JavaConverters 24 | 25 | object Histogram { 26 | 27 | /** 28 | * Compute the histogram of a column when aggregated by aggregate columns. Thresholds are expected to be provided in 29 | * ascending order. The result dataframe contains the aggregate and histogram columns only. For each threshold value 30 | * in thresholds, there will be a column named s"≤threshold". There will also be a final column called 31 | * s">last_threshold", that counts the remaining values that exceed the last threshold. 32 | * 33 | * @param df 34 | * dataset to compute histogram from 35 | * @param thresholds 36 | * sequence of thresholds in ascending order, must implement <= and > operators w.r.t. 
valueColumn 37 | * @param valueColumn 38 | * histogram is computed for values of this column 39 | * @param aggregateColumns 40 | * histogram is computed against these columns 41 | * @tparam T 42 | * type of histogram thresholds 43 | * @return 44 | * dataframe with aggregate and histogram columns 45 | */ 46 | def of[D, T](df: Dataset[D], thresholds: Seq[T], valueColumn: Column, aggregateColumns: Column*): DataFrame = { 47 | if (thresholds.isEmpty) 48 | throw new IllegalArgumentException("Thresholds must not be empty") 49 | 50 | val bins = if (thresholds.length == 1) Seq.empty else thresholds.sliding(2).toSeq 51 | 52 | if (bins.exists(s => s.head == s.last)) 53 | throw new IllegalArgumentException(s"Thresholds must not contain duplicates: ${thresholds.mkString(",")}") 54 | 55 | df.toDF() 56 | .withColumn(s"≤${thresholds.head}", when(valueColumn <= thresholds.head, 1).otherwise(0)) 57 | .call(bins.foldLeft(_) { case (df, bin) => 58 | df.withColumn(s"≤${bin.last}", when(valueColumn > bin.head && valueColumn <= bin.last, 1).otherwise(0)) 59 | }) 60 | .withColumn(s">${thresholds.last}", when(valueColumn > thresholds.last, 1).otherwise(0)) 61 | .groupBy(aggregateColumns: _*) 62 | .agg( 63 | Some(thresholds.head).map(t => sum(backticks(s"≤$t")).as(s"≤$t")).get, 64 | thresholds.tail.map(t => sum(backticks(s"≤$t")).as(s"≤$t")) :+ 65 | sum(backticks(s">${thresholds.last}")).as(s">${thresholds.last}"): _* 66 | ) 67 | } 68 | 69 | /** 70 | * Compute the histogram of a column when aggregated by aggregate columns. Thresholds are expected to be provided in 71 | * ascending order. The result dataframe contains the aggregate and histogram columns only. For each threshold value 72 | * in thresholds, there will be a column named s"≤threshold". There will also be a final column called 73 | * s">last_threshold", that counts the remaining values that exceed the last threshold. 74 | * 75 | * @param df 76 | * dataset to compute histogram from 77 | * @param thresholds 78 | * sequence of thresholds in ascending order, must implement <= and > operators w.r.t. valueColumn 79 | * @param valueColumn 80 | * histogram is computed for values of this column 81 | * @param aggregateColumns 82 | * histogram is computed against these columns 83 | * @tparam T 84 | * type of histogram thresholds 85 | * @return 86 | * dataframe with aggregate and histogram columns 87 | */ 88 | @scala.annotation.varargs 89 | def of[D, T]( 90 | df: Dataset[D], 91 | thresholds: java.util.List[T], 92 | valueColumn: Column, 93 | aggregateColumns: Column* 94 | ): DataFrame = 95 | of(df, JavaConverters.iterableAsScalaIterable(thresholds).toSeq, valueColumn, aggregateColumns: _*) 96 | 97 | } 98 | -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/spark/RowNumbers.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.sql.expressions.Window 20 | import org.apache.spark.sql.{Column, DataFrame, Dataset, functions} 21 | import org.apache.spark.sql.functions.{coalesce, col, lit, max, monotonically_increasing_id, spark_partition_id, sum} 22 | import org.apache.spark.storage.StorageLevel 23 | 24 | case class RowNumbersFunc( 25 | rowNumberColumnName: String = "row_number", 26 | storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK, 27 | unpersistHandle: UnpersistHandle = UnpersistHandle.Noop, 28 | orderColumns: Seq[Column] = Seq.empty 29 | ) { 30 | 31 | def withRowNumberColumnName(rowNumberColumnName: String): RowNumbersFunc = 32 | this.copy(rowNumberColumnName = rowNumberColumnName) 33 | 34 | def withStorageLevel(storageLevel: StorageLevel): RowNumbersFunc = 35 | this.copy(storageLevel = storageLevel) 36 | 37 | def withUnpersistHandle(unpersistHandle: UnpersistHandle): RowNumbersFunc = 38 | this.copy(unpersistHandle = unpersistHandle) 39 | 40 | def withOrderColumns(orderColumns: Seq[Column]): RowNumbersFunc = 41 | this.copy(orderColumns = orderColumns) 42 | 43 | def of[D](df: Dataset[D]): DataFrame = { 44 | if ( 45 | storageLevel.equals( 46 | StorageLevel.NONE 47 | ) && (SparkMajorVersion > 3 || SparkMajorVersion == 3 && SparkMinorVersion >= 5) 48 | ) { 49 | throw new IllegalArgumentException(s"Storage level $storageLevel not supported with Spark 3.5.0 and above.") 50 | } 51 | 52 | // define some column names that do not exist in ds 53 | val prefix = distinctPrefixFor(df.columns) 54 | val monoIdColumnName = prefix + "mono_id" 55 | val partitionIdColumnName = prefix + "partition_id" 56 | val localRowNumberColumnName = prefix + "local_row_number" 57 | val maxLocalRowNumberColumnName = prefix + "max_local_row_number" 58 | val cumRowNumbersColumnName = prefix + "cum_row_numbers" 59 | val partitionOffsetColumnName = prefix + "partition_offset" 60 | 61 | // if no order is given, we preserve existing order 62 | val dfOrdered = 63 | if (orderColumns.isEmpty) df.withColumn(monoIdColumnName, monotonically_increasing_id()) 64 | else df.orderBy(orderColumns: _*) 65 | val order = if (orderColumns.isEmpty) Seq(col(monoIdColumnName)) else orderColumns 66 | 67 | // add partition ids and local row numbers 68 | val localRowNumberWindow = Window.partitionBy(partitionIdColumnName).orderBy(order: _*) 69 | val dfWithPartitionId = dfOrdered 70 | .withColumn(partitionIdColumnName, spark_partition_id()) 71 | .persist(storageLevel) 72 | unpersistHandle.setDataFrame(dfWithPartitionId) 73 | val dfWithLocalRowNumbers = dfWithPartitionId 74 | .withColumn(localRowNumberColumnName, functions.row_number().over(localRowNumberWindow)) 75 | 76 | // compute row offset for the partitions 77 | val cumRowNumbersWindow = Window 78 | .orderBy(partitionIdColumnName) 79 | .rowsBetween(Window.unboundedPreceding, Window.currentRow) 80 | val partitionOffsets = dfWithLocalRowNumbers 81 | .groupBy(partitionIdColumnName) 82 | .agg(max(localRowNumberColumnName).alias(maxLocalRowNumberColumnName)) 83 | .withColumn(cumRowNumbersColumnName, sum(maxLocalRowNumberColumnName).over(cumRowNumbersWindow)) 84 | .select( 85 | col(partitionIdColumnName) + 1 as partitionIdColumnName, 86 | col(cumRowNumbersColumnName).as(partitionOffsetColumnName) 87 | ) 88 | 89 | // compute global row number by adding local row number with partition offset 90 | val partitionOffsetColumn = coalesce(col(partitionOffsetColumnName), lit(0)) 91 | dfWithLocalRowNumbers 92 | .join(partitionOffsets, 
Seq(partitionIdColumnName), "left") 93 | .withColumn(rowNumberColumnName, col(localRowNumberColumnName) + partitionOffsetColumn) 94 | .drop(monoIdColumnName, partitionIdColumnName, localRowNumberColumnName, partitionOffsetColumnName) 95 | } 96 | 97 | } 98 | 99 | object RowNumbers { 100 | def default(): RowNumbersFunc = RowNumbersFunc() 101 | 102 | def withRowNumberColumnName(rowNumberColumnName: String): RowNumbersFunc = 103 | default().withRowNumberColumnName(rowNumberColumnName) 104 | 105 | def withStorageLevel(storageLevel: StorageLevel): RowNumbersFunc = 106 | default().withStorageLevel(storageLevel) 107 | 108 | def withUnpersistHandle(unpersistHandle: UnpersistHandle): RowNumbersFunc = 109 | default().withUnpersistHandle(unpersistHandle) 110 | 111 | @scala.annotation.varargs 112 | def withOrderColumns(orderColumns: Column*): RowNumbersFunc = 113 | default().withOrderColumns(orderColumns) 114 | 115 | def of[D](ds: Dataset[D]): DataFrame = default().of(ds) 116 | } 117 | -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/spark/SparkVersion.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2023 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.SPARK_VERSION_SHORT 20 | 21 | /** 22 | * Provides versions from the runtime environment. 23 | */ 24 | trait SparkVersion { 25 | private def SparkVersionSeq: Seq[Int] = SPARK_VERSION_SHORT.split('.').toSeq.map(_.toInt) 26 | 27 | def SparkMajorVersion: Int = SparkVersionSeq.head 28 | def SparkMinorVersion: Int = SparkVersionSeq(1) 29 | def SparkPatchVersion: Int = SparkVersionSeq(2) 30 | 31 | def SparkVersion: (Int, Int, Int) = (SparkMajorVersion, SparkMinorVersion, SparkPatchVersion) 32 | def SparkCompatVersion: (Int, Int) = (SparkMajorVersion, SparkMinorVersion) 33 | def SparkCompatVersionString: String = SparkVersionSeq.slice(0, 2).mkString(".") 34 | } 35 | -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/spark/UnpersistHandle.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License.
15 | */ 16 | 17 | package uk.co.gresearch.spark 18 | 19 | import org.apache.spark.sql.DataFrame 20 | 21 | /** 22 | * Handle to call `DataFrame.unpersist` on a `DataFrame` that is not known to the caller. The [[RowNumbers.of]] method 23 | * constructs a `DataFrame` that is based on an intermediate cached `DataFrame`, for which `unpersist` must be called. 24 | * A provided [[UnpersistHandle]] allows doing that in user code. 25 | */ 26 | class UnpersistHandle { 27 | var df: Option[DataFrame] = None 28 | 29 | private[spark] def setDataFrame(dataframe: DataFrame): DataFrame = { 30 | if (df.isDefined) throw new IllegalStateException("DataFrame has been set already, it cannot be reused.") 31 | this.df = Some(dataframe) 32 | dataframe 33 | } 34 | 35 | def apply(): Unit = { 36 | this.df.getOrElse(throw new IllegalStateException("DataFrame has to be set first")).unpersist() 37 | } 38 | 39 | def apply(blocking: Boolean): Unit = { 40 | this.df.getOrElse(throw new IllegalStateException("DataFrame has to be set first")).unpersist(blocking) 41 | } 42 | } 43 | 44 | case class SilentUnpersistHandle() extends UnpersistHandle { 45 | override def apply(): Unit = { 46 | this.df.foreach(_.unpersist()) 47 | } 48 | 49 | override def apply(blocking: Boolean): Unit = { 50 | this.df.foreach(_.unpersist(blocking)) 51 | } 52 | } 53 | 54 | case class NoopUnpersistHandle() extends UnpersistHandle { 55 | override def setDataFrame(dataframe: DataFrame): DataFrame = dataframe 56 | override def apply(): Unit = {} 57 | override def apply(blocking: Boolean): Unit = {} 58 | } 59 | 60 | object UnpersistHandle { 61 | val Noop: NoopUnpersistHandle = NoopUnpersistHandle() 62 | def apply(): UnpersistHandle = new UnpersistHandle() 63 | 64 | def withUnpersist[T](blocking: Boolean = false)(func: UnpersistHandle => T): T = { 65 | val handle = SilentUnpersistHandle() 66 | try { 67 | func(handle) 68 | } finally { 69 | handle(blocking) 70 | } 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /src/main/scala/uk/co/gresearch/spark/diff/DiffComparators.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 G-Research 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package uk.co.gresearch.spark.diff 18 | 19 | import org.apache.spark.sql.Encoder 20 | import org.apache.spark.sql.types.DataType 21 | import uk.co.gresearch.spark.diff.comparator._ 22 | 23 | import java.time.Duration 24 | 25 | object DiffComparators { 26 | 27 | /** 28 | * The default comparator used in [[DiffOptions.default.defaultComparator]]. 29 | */ 30 | def default(): DiffComparator = DefaultDiffComparator 31 | 32 | /** 33 | * A comparator equivalent to `Column <=> Column`. Null values are considered equal. 34 | */ 35 | def nullSafeEqual(): DiffComparator = NullSafeEqualDiffComparator 36 | 37 | /** 38 | * Return a comparator that uses the given [[math.Equiv]] to compare values of type [[T]].
The implicit [[Encoder]] of 39 | * type [[T]] determines the input data type of the comparator. Only columns of that type can be compared. 40 | */ 41 | def equiv[T: Encoder](equiv: math.Equiv[T]): EquivDiffComparator[T] = EquivDiffComparator(equiv) 42 | 43 | /** 44 | * Return a comparator that uses the given [[math.Equiv]] to compare values of type [[T]]. Only columns of the given 45 | * data type `inputType` can be compared. 46 | */ 47 | def equiv[T](equiv: math.Equiv[T], inputType: DataType): EquivDiffComparator[T] = 48 | EquivDiffComparator(equiv, inputType) 49 | 50 | /** 51 | * Return a comparator that uses the given [[math.Equiv]] to compare values of any type. 52 | */ 53 | def equiv(equiv: math.Equiv[Any]): EquivDiffComparator[Any] = EquivDiffComparator(equiv) 54 | 55 | /** 56 | * This comparator considers values equal when they are less than `epsilon` apart. It can be configured to use 57 | * `epsilon` as an absolute (`.asAbsolute()`) threshold, or as relative (`.asRelative()`) to the larger value. 58 | * Further, the threshold itself can be considered equal (`.asInclusive()`) or not equal (`.asExclusive()`): 59 | * 60 | * 65 | * 66 | * Requires compared column types to implement `-`, `*`, `<`, `==`, and `abs`. 67 | */ 68 | def epsilon(epsilon: Double): EpsilonDiffComparator = EpsilonDiffComparator(epsilon) 69 | 70 | /** 71 | * A comparator for string values. 72 | * 73 | * With `whitespaceAgnostic` set `true`, differences in white spaces are ignored. This ignores leading and trailing 74 | * whitespaces as well. With `whitespaceAgnostic` set `false`, this is equal to the default string comparison (see 75 | * [[default()]]). 76 | */ 77 | def string(whitespaceAgnostic: Boolean = true): StringDiffComparator = 78 | if (whitespaceAgnostic) { 79 | WhitespaceDiffComparator 80 | } else { 81 | StringDiffComparator 82 | } 83 | 84 | /** 85 | * This comparator considers two `DateType` or `TimestampType` values equal when they are at most `duration` apart. 86 | * Duration is an instance of `java.time.Duration`. 87 | * 88 | * The comparator can be configured to consider `duration` as equal (`.asInclusive()`) or not equal 89 | * (`.asExclusive()`):