├── .git-blame-ignore-revs ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ └── generic.yml ├── dependabot.yml └── workflows │ ├── changelog.yaml │ ├── ci.yml │ ├── clean.yml │ ├── dependency-graph.yml │ ├── potential-duplicates.yml │ ├── pr-agent.yaml │ └── rebase.yml ├── .gitignore ├── .mill-jvm-opts ├── .mill-version ├── .scala-steward.conf ├── .scalafmt.conf ├── CHANGELOG.md ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE.md ├── LICENSE ├── README.md ├── build.mill ├── docs ├── README.md ├── azure_synapse.md └── spark_excel_examples.ipynb ├── mill ├── private-key.pem.enc ├── scalastyle-config.xml └── src ├── README.md ├── main ├── 2.4 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ ├── ExcelSparkInternal.scala │ │ └── excel │ │ └── v2 │ │ ├── ExcelDataSource.scala │ │ ├── ExcelDateTimeStringUtils.scala │ │ ├── ExcelFilters.scala │ │ ├── ExcelOptions.scala │ │ ├── ExcelParserBase.scala │ │ ├── FailureSafeParser.scala │ │ └── SchemaUtils.scala ├── 3.0 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelDateTimeStringUtils.scala │ │ └── ExcelFilters.scala ├── 3.0_and_up │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelDataSource.scala │ │ └── ExcelFileFormat.scala ├── 3.0_to_3.1 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelOutputWriter.scala │ │ ├── ExcelTable.scala │ │ └── ExcelWriteBuilder.scala ├── 3.0_to_3.2 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelScan.scala │ │ └── ExcelScanBuilder.scala ├── 3.0_to_3.3 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ └── ExcelOptions.scala ├── 3.0_to_3.4.1 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelParserBase.scala │ │ └── ExcelPartitionReaderFactory.scala ├── 3.1 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ └── ExcelDateTimeStringUtils.scala ├── 3.1_and_up │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ └── ExcelFilters.scala ├── 3.2_and_up │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelDateTimeStringUtils.scala │ │ ├── ExcelOutputWriter.scala │ │ ├── ExcelTable.scala │ │ └── ExcelWriteBuilder.scala ├── 3.3_and_up │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelScan.scala │ │ └── ExcelScanBuilder.scala ├── 3.4.2_and_up │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelParserBase.scala │ │ └── ExcelPartitionReaderFactory.scala ├── 3.4_and_up │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ └── ExcelOptions.scala ├── resources │ └── META-INF │ │ └── services │ │ └── org.apache.spark.sql.sources.DataSourceRegister └── scala │ └── dev │ └── mauch │ └── spark │ └── excel │ ├── DataColumn.scala │ ├── DataLocator.scala │ ├── DefaultSource.scala │ ├── DefaultSource15.scala │ ├── ExcelFileSaver.scala │ ├── ExcelRelation.scala │ ├── InferSchema.scala │ ├── PlainNumberFormat.scala │ ├── Utils.scala │ ├── WorkbookReader.scala │ ├── package.scala │ └── v2 │ ├── DataLocator.scala │ ├── ExcelGenerator.scala │ ├── ExcelHeaderChecker.scala │ ├── ExcelHelper.scala │ ├── ExcelInferSchema.scala │ ├── ExcelOptionsTrait.scala │ ├── ExcelParser.scala │ └── SheetData.scala └── test ├── resources ├── log4j2.properties └── spreadsheets │ ├── Issue_747_plain_number.xlsx │ ├── apache_poi │ ├── 
57231_MixedGasReport.xls │ └── DataTableCities.xlsx │ ├── ca_dataset │ └── 2019 │ │ ├── Quarter=1 │ │ └── ca_03.xlsx │ │ ├── Quarter=2 │ │ ├── ca_04.xlsx │ │ ├── ca_05.xlsx │ │ └── ca_06.xlsx │ │ ├── Quarter=3 │ │ ├── ca_07.xlsx │ │ ├── ca_08.xlsx │ │ └── ca_09.xlsx │ │ └── Quarter=4 │ │ ├── ca_10.xlsx │ │ ├── ca_11.xlsx │ │ └── ca_12.xlsx │ ├── infer_stricter_numerical_types.xls │ ├── infer_stricter_numerical_types.xlsx │ ├── issue_162_nihar_gharat.xlsx │ ├── issue_285_bryce21.xlsx │ ├── issue_463_cristichircu.xlsx │ ├── issue_942_sheetname_digits.xlsx │ ├── issue_944_faulty_dimension.md │ ├── issue_944_faulty_dimension.xlsx │ ├── issue_965_blank_rows.md │ ├── issue_965_blank_rows.xlsx │ ├── plain_number.xlsx │ ├── read_multiple_sheets_at_once.xlsx │ ├── read_multiple_sheets_at_once_noheader.xlsx │ ├── simple_encrypted.xls │ ├── simple_encrypted.xlsx │ └── with_errors_all_types.xlsx └── scala └── dev └── mauch ├── spark ├── DataFrameSuiteBase.scala └── excel │ ├── DataLocatorSuite.scala │ ├── EncryptedReadSuite.scala │ ├── ErrorsAsStringsReadSuite.scala │ ├── Generators.scala │ ├── IntegrationSuite.scala │ ├── PlainNumberReadSuite.scala │ ├── RichRowSuite.scala │ └── v2 │ ├── AreaReferenceReadSuite.scala │ ├── DataFrameWriterApiComplianceSuite.scala │ ├── EncryptedReadSuite.scala │ ├── ErrorsAsStringsReadSuite.scala │ ├── ExcelTestingUtilities.scala │ ├── GlobPartitionAndFileNameSuite.scala │ ├── InferStricterNumericalTypesSuite.scala │ ├── KeepUndefinedRowsSuite.scala │ ├── LocalFileTestingUtilities.scala │ ├── ManyPartitionReadSuite.scala │ ├── NumericTypesSuite.scala │ ├── PlainNumberReadSuite.scala │ ├── ProjectionAndFilterPushdownSuite.scala │ ├── RowNumberColumnSuite.scala │ ├── TableReadSuite.scala │ ├── UserReportedIssuesSuite.scala │ └── WriteAndReadSuite.scala └── tags └── package.scala /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Scala Steward: Reformat with scalafmt 3.6.1 2 | a834cf94453ed2f3ab1b87818c2fd124fe87fa2a 3 | 4 | # Scala Steward: Reformat with scalafmt 3.7.11 5 | 11269e71a3460ae21f2a96ac8416c0bdd3f1f3b0 6 | 7 | # Scala Steward: Reformat with scalafmt 3.7.15 8 | 17f6ce5807fb3a91938824a285e30f786adea570 9 | 10 | # Scala Steward: Reformat with scalafmt 3.7.17 11 | e4fde8d1e6e34db2d24949275429ce3a7885c2ad 12 | 13 | # Scala Steward: Reformat with scalafmt 3.8.5 14 | 59dd3ea00b8772fd4e8798fde7941c1745ca83f2 15 | 16 | # Scala Steward: Reformat with scalafmt 3.9.5 17 | 19da40630c2645140336554bbce4a48881367bd2 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/generic.yml: -------------------------------------------------------------------------------- 1 | name: 🐞 Bug 2 | description: File a bug/issue 3 | title: "[BUG] " 4 | labels: [Bug, Needs Triage] 5 | body: 6 | - type: checkboxes 7 | attributes: 8 | label: Am I using the newest version of the library? 9 | description: Please always use the latest version before posting any issues. Your bug might already have been solved.. 10 | options: 11 | - label: I have made sure that I'm using the latest version of the library. 12 | required: true 13 | - type: checkboxes 14 | attributes: 15 | label: Is there an existing issue for this? 
16 | description: Please search to see if an issue already exists for the bug you encountered. 17 | options: 18 | - label: I have searched the existing issues 19 | required: true 20 | - type: textarea 21 | attributes: 22 | label: Current Behavior 23 | description: A concise description of what you're experiencing. 24 | validations: 25 | required: false 26 | - type: textarea 27 | attributes: 28 | label: Expected Behavior 29 | description: A concise description of what you expected to happen. 30 | validations: 31 | required: false 32 | - type: textarea 33 | attributes: 34 | label: Steps To Reproduce 35 | description: Steps to reproduce the behavior. 36 | placeholder: | 37 | Steps to Reproduce (for bugs) 38 | Provide a link to a live example, or an unambiguous set of steps to reproduce this bug. Include code to reproduce, if relevant. Example: 39 | Download the example file uploaded here 40 | Start Spark from command line as spark-shell --packages dev.mauch:spark-excel_2.12:x.y.z --foo=bar 41 | Read the downloaded example file 42 | val df = spark.read 43 | .format("dev.mauch.spark.excel") 44 | .option("dataAddress", "'My Sheet'!B3:C35") 45 | .load("example_file_exhibiting_bug.xlsx") 46 | validations: 47 | required: false 48 | - type: textarea 49 | attributes: 50 | label: Environment 51 | description: | 52 | examples: 53 | Include as many relevant details about the environment you experienced the bug in 54 | Spark version and language (Scala, Java, Python, R, ...): 55 | Spark-Excel version: 56 | Operating System and versioncluster environment, ...: 57 | value: | 58 | - Spark version: 59 | - Spark-Excel version: 60 | - OS: 61 | - Cluster environment 62 | render: markdown 63 | validations: 64 | required: false 65 | - type: textarea 66 | attributes: 67 | label: Anything else? 68 | description: | 69 | Links? References? Anything that will give us more context about the issue you are encountering! 70 | 71 | Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in. 72 | validations: 73 | required: false 74 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "github-actions" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/changelog.yaml: -------------------------------------------------------------------------------- 1 | name: Changelog 2 | 3 | on: 4 | push: 5 | tags: 6 | - v[0-9]+.[0-9]+.[0-9]+ 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout Code 14 | uses: actions/checkout@v4 15 | 16 | - name: Update CHANGELOG 17 | id: changelog 18 | uses: Requarks/changelog-action@v1 19 | with: 20 | token: ${{ github.token }} 21 | tag: ${{ github.ref_name }} 22 | 23 | - name: Commit CHANGELOG.md 24 | uses: stefanzweifel/git-auto-commit-action@v5 25 | with: 26 | branch: main 27 | commit_message: 'docs: update CHANGELOG.md for ${{ github.ref_name }} [skip ci]' 28 | file_pattern: CHANGELOG.md 29 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: 4 | pull_request: 5 | branches: ['**', '!update/**', '!pr/**'] 6 | push: 7 | branches: ['**', '!update/**', '!pr/**'] 8 | tags: [v*] 9 | 10 | env: 11 | PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }} 12 | SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }} 13 | SONATYPE_CREDENTIAL_HOST: ${{ secrets.SONATYPE_CREDENTIAL_HOST }} 14 | SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }} 15 | PGP_SECRET: ${{ secrets.PGP_SECRET }} 16 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 17 | 18 | jobs: 19 | prepare: 20 | runs-on: ubuntu-latest 21 | outputs: 22 | matrix: ${{ steps.set-matrix.outputs.matrix }} 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@v4 26 | 27 | - name: Generate matrix 28 | id: set-matrix 29 | run: | 30 | echo -n "matrix=" >> $GITHUB_OUTPUT 31 | ./mill resolve "spark-excel[_,_]" | \ 32 | jq -Rsc 'split("\n") | map(capture("spark-excel\\[(?<scala>[^,]+),(?<spark>[^\\]]+)\\]") | select(.)) | {include: .}' >> $GITHUB_OUTPUT 33 | 34 | build: 35 | needs: prepare 36 | name: Build and Test 37 | strategy: 38 | fail-fast: false 39 | matrix: ${{ fromJson(needs.prepare.outputs.matrix) }} 40 | runs-on: ubuntu-latest 41 | steps: 42 | - name: Checkout current branch (full) 43 | uses: actions/checkout@v4 44 | with: 45 | fetch-depth: 0 46 | 47 | - name: Download Java (temurin@11) 48 | id: download-java-temurin-11 49 | uses: typelevel/download-java@v2 50 | with: 51 | distribution: temurin 52 | java-version: 11 53 | 54 | - name: Setup Java (temurin@11) 55 | uses: actions/setup-java@v4 56 | with: 57 | distribution: jdkfile 58 | java-version: 11 59 | jdkFile: ${{ steps.download-java-temurin-11.outputs.jdkFile }} 60 | 61 | - name: Cache mill 62 | uses: actions/cache@v4 63 | with: 64 | path: | 65 | ~/.mill 66 | ~/.ivy2/cache 67 | ~/.coursier/cache/v1 68 | ~/.cache/coursier/v1 69 | ~/AppData/Local/Coursier/Cache/v1 70 | ~/Library/Caches/Coursier/v1 71 | key: ${{ runner.os }}-mill-cache-v2-${{ hashFiles('**/*.mill') }}-${{ hashFiles('project/build.properties') }} 72 | 73 | - name: Test 74 | run: ./mill spark-excel[${{ matrix.scala }},${{ matrix.spark }}].test 75 | 76 | - name: Publish 
Test Report 77 | uses: mikepenz/action-junit-report@v5 78 | if: always() # always run even if the previous step fails 79 | with: 80 | fail_on_failure: false 81 | include_passed: false 82 | detailed_summary: true 83 | annotate_only: true 84 | require_tests: false 85 | report_paths: 'out/**/test-report.xml' 86 | 87 | publish: 88 | name: Publish Artifacts 89 | needs: [build] 90 | if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main') 91 | strategy: 92 | matrix: 93 | os: [ubuntu-latest] 94 | scala: [2.12.20] 95 | java: [temurin@11] 96 | runs-on: ${{ matrix.os }} 97 | steps: 98 | - name: Checkout current branch (full) 99 | uses: actions/checkout@v4 100 | with: 101 | fetch-depth: 0 102 | 103 | - name: Download Java (temurin@11) 104 | id: download-java-temurin-11 105 | if: matrix.java == 'temurin@11' 106 | uses: typelevel/download-java@v2 107 | with: 108 | distribution: temurin 109 | java-version: 11 110 | 111 | - name: Setup Java (temurin@11) 112 | if: matrix.java == 'temurin@11' 113 | uses: actions/setup-java@v4 114 | with: 115 | distribution: jdkfile 116 | java-version: 11 117 | jdkFile: ${{ steps.download-java-temurin-11.outputs.jdkFile }} 118 | 119 | - name: Cache mill 120 | uses: actions/cache@v4 121 | with: 122 | path: | 123 | ~/.mill 124 | ~/.ivy2/cache 125 | ~/.coursier/cache/v1 126 | ~/.cache/coursier/v1 127 | ~/AppData/Local/Coursier/Cache/v1 128 | ~/Library/Caches/Coursier/v1 129 | key: ${{ runner.os }}-mill-cache-v2-${{ hashFiles('**/*.mill') }}-${{ hashFiles('project/build.properties') }} 130 | 131 | - name: Import GPG Key 132 | uses: crazy-max/ghaction-import-gpg@v6 133 | with: 134 | gpg_private_key: ${{ secrets.PGP_SECRET }} 135 | passphrase: ${{ secrets.PGP_PASSPHRASE }} 136 | trust_level: 5 137 | 138 | - name: Publish 139 | run: | 140 | export GPG_TTY=$(tty) 141 | ./mill -i mill.scalalib.SonatypeCentralPublishModule/ \ 142 | --username $SONATYPE_USERNAME \ 143 | --password $SONATYPE_PASSWORD \ 144 | --gpgArgs "--passphrase=$PGP_PASSPHRASE,--no-tty,--pinentry-mode,loopback,--batch,--yes,-a,-b" \ 145 | --bundleName dev.mauch-spark-excel-$(date +%Y-%m-%d-%H-%M) 146 | -------------------------------------------------------------------------------- /.github/workflows/clean.yml: -------------------------------------------------------------------------------- 1 | name: Clean 2 | 3 | on: push 4 | 5 | jobs: 6 | delete-artifacts: 7 | name: Delete Artifacts 8 | runs-on: ubuntu-latest 9 | env: 10 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 11 | steps: 12 | - name: Delete artifacts 13 | run: | 14 | # Customize those three lines with your repository and credentials: 15 | REPO=${GITHUB_API_URL}/repos/${{ github.repository }} 16 | 17 | # A shortcut to call GitHub API. 18 | ghapi() { curl --silent --location --user _:$GITHUB_TOKEN "$@"; } 19 | 20 | # A temporary file which receives HTTP response headers. 21 | TMPFILE=/tmp/tmp.$$ 22 | 23 | # An associative array, key: artifact name, value: number of artifacts of that name. 24 | declare -A ARTCOUNT 25 | 26 | # Process all artifacts on this repository, loop on returned "pages". 27 | URL=$REPO/actions/artifacts 28 | while [[ -n "$URL" ]]; do 29 | 30 | # Get current page, get response headers in a temporary file. 31 | JSON=$(ghapi --dump-header $TMPFILE "$URL") 32 | 33 | # Get URL of next page. Will be empty if we are at the last page. 
34 | URL=$(grep '^Link:' "$TMPFILE" | tr ',' '\n' | grep 'rel="next"' | head -1 | sed -e 's/.*<//' -e 's/>.*//') 35 | rm -f $TMPFILE 36 | 37 | # Number of artifacts on this page: 38 | COUNT=$(( $(jq <<<$JSON -r '.artifacts | length') )) 39 | 40 | # Loop on all artifacts on this page. 41 | for ((i=0; $i < $COUNT; i++)); do 42 | 43 | # Get name of artifact and count instances of this name. 44 | name=$(jq <<<$JSON -r ".artifacts[$i].name?") 45 | ARTCOUNT[$name]=$(( $(( ${ARTCOUNT[$name]} )) + 1)) 46 | 47 | id=$(jq <<<$JSON -r ".artifacts[$i].id?") 48 | size=$(( $(jq <<<$JSON -r ".artifacts[$i].size_in_bytes?") )) 49 | printf "Deleting '%s' #%d, %'d bytes\n" $name ${ARTCOUNT[$name]} $size 50 | ghapi -X DELETE $REPO/actions/artifacts/$id 51 | done 52 | done 53 | -------------------------------------------------------------------------------- /.github/workflows/dependency-graph.yml: -------------------------------------------------------------------------------- 1 | name: github-dependency-graph 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | submit-dependency-graph: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - uses: coursier/cache-action@v6 14 | - uses: actions/setup-java@v4 15 | with: 16 | distribution: 'temurin' 17 | java-version: '17' 18 | - uses: ckipp01/mill-dependency-submission@v1 19 | -------------------------------------------------------------------------------- /.github/workflows/potential-duplicates.yml: -------------------------------------------------------------------------------- 1 | name: Potential Duplicates 2 | on: 3 | issues: 4 | types: [opened, edited] 5 | jobs: 6 | run: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: wow-actions/potential-duplicates@v1 10 | with: 11 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 12 | # Issue title filter work with anymatch https://www.npmjs.com/package/anymatch. 13 | # Any matched issue will stop detection immediately. 14 | # You can specify multi filters in each line. 15 | filter: '' 16 | # Exclude keywords in title before detecting. 17 | exclude: '' 18 | # Label to set, when potential duplicates are detected. 19 | label: potential-duplicate 20 | # Get issues with state to compare. Supported state: 'all', 'closed', 'open'. 21 | state: all 22 | # If similarity is higher than this threshold([0,1]), issue will be marked as duplicate. 23 | threshold: 0.6 24 | # Reactions to be add to comment when potential duplicates are detected. 25 | # Available reactions: "-1", "+1", "confused", "laugh", "heart", "hooray", "rocket", "eyes" 26 | reactions: 'eyes, confused' 27 | # Comment to post when potential duplicates are detected. 28 | comment: > 29 | Please check these potential duplicates: {{#issues}} 30 | - [#{{ number }}] {{ title }} ({{ accuracy }}%) 31 | {{/issues}} 32 | 33 | If this issue is a duplicate, please add any additional info to the ticket with the most information and close this one. 
34 | -------------------------------------------------------------------------------- /.github/workflows/pr-agent.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | issue_comment: 4 | jobs: 5 | pr_agent_job: 6 | runs-on: ubuntu-latest 7 | permissions: 8 | issues: write 9 | pull-requests: write 10 | contents: write 11 | name: Run pr agent on every pull request, respond to user comments 12 | steps: 13 | - name: PR Agent action step 14 | id: pragent 15 | uses: Codium-ai/pr-agent@main 16 | env: 17 | OPENAI_KEY: ${{ secrets.OPENAI_KEY }} 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | -------------------------------------------------------------------------------- /.github/workflows/rebase.yml: -------------------------------------------------------------------------------- 1 | name: Automatic Rebase 2 | on: 3 | issue_comment: 4 | types: [created] 5 | jobs: 6 | rebase: 7 | name: Rebase 8 | if: github.event.issue.pull_request != '' && contains(github.event.comment.body, '/rebase') 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout the latest code 12 | uses: actions/checkout@v4 13 | with: 14 | token: ${{ secrets.GITHUB_TOKEN }} 15 | fetch-depth: 0 # otherwise, you will fail to push refs to dest repo 16 | - name: Automatic Rebase 17 | uses: cirrus-actions/rebase@1.8 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | project/target/ 3 | project/project/ 4 | out/ 5 | *.p12 6 | .ensime* 7 | *.swp 8 | .idea 9 | *.log 10 | 11 | .metals/ 12 | project/metals.sbt 13 | **/.bsp/ 14 | **/.bloop/ 15 | .vscode 16 | private-key.pem 17 | .secrets 18 | .~lock.*.xlsx# 19 | -------------------------------------------------------------------------------- /.mill-jvm-opts: -------------------------------------------------------------------------------- 1 | -Xmx4G 2 | -------------------------------------------------------------------------------- /.mill-version: -------------------------------------------------------------------------------- 1 | 0.12.14 2 | -------------------------------------------------------------------------------- /.scala-steward.conf: -------------------------------------------------------------------------------- 1 | updatePullRequests = "always" 2 | commits.message = "chore: Update ${artifactName} from ${currentVersion} to ${nextVersion}" 3 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.9.7 2 | style = default 3 | runner.dialect=scala212 4 | maxColumn = 120 5 | continuationIndent.defnSite = 2 6 | continuationIndent.callSite = 2 7 | align.preset = "none" 8 | danglingParentheses.preset = true 9 | optIn.configStyleArguments = false 10 | docstrings.style = SpaceAsterisk 11 | spaces.beforeContextBoundColon = true 12 | rewrite.rules = [SortImports] 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via an issue 4 | with the owners of this repository before making a change. 5 | 6 | ## Pull Request Process 7 | 8 | 1. 
Unless the changes are trivial extensions or bugfixes, 9 | please create an issue proposing what you want to change first. 10 | 2. After coordination with the project maintainers, 11 | go ahead and create the PR. 12 | 3. If you want to do larger refactorings that are not obviously necessary for the PR 13 | please coordinate with the project maintainers first. 14 | We're open to refactorings but would like to discuss and review them independently. 15 | 4. Auto-format your code using `mill mill.scalalib.scalafmt.ScalafmtModule/reformatAll __.sources`. 16 | 5. Run all tests locally using `mill spark-excel[__].test`. 17 | 6. Update the `README.md` and `CHANGELOG.md` with details of changes to the interface. 18 | 7. Rebase your changes to the latest master in case something changed there. 19 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Your issue may already be reported! 2 | Please search on the [issue track](../) before creating one. 3 | Moreover, please read the [`CHANGELOG.md`](../../blob/master/CHANGELOG.md) file for any changes you might have missed. 4 | 5 | ## Expected Behavior 6 | > If you're describing a bug, tell us what should happen 7 | > If you're suggesting a change/improvement, tell us how it should work 8 | 9 | ## Current Behavior 10 | > If describing a bug, tell us what happens instead of the expected behavior 11 | > If suggesting a change/improvement, explain the difference from current behavior. 12 | > If you have a stack trace or any helpful information from the console, paste it in its entirety. 13 | > If the problem happens with a certain file, upload it somewhere and paste a link. 14 | 15 | ## Possible Solution 16 | > Not obligatory, but suggest a fix/reason for the bug, 17 | > or ideas how to implement the addition or change 18 | 19 | ## Steps to Reproduce (for bugs) 20 | > Provide a link to a live example, or an unambiguous set of steps to 21 | > reproduce this bug. Include code to reproduce, if relevant. 22 | > Example: 23 | 1. Download the example file uploaded [here](http://example.com/) 24 | 2. Start Spark from command line as `spark-shell --packages dev.mauch:spark-excel_2.12:x.y.z --foo=bar` 25 | 3. Read the downloaded example file 26 | ``` 27 | val df = spark.read 28 | .format("dev.mauch.spark.excel") 29 | .option("dataAddress", "'My Sheet'!B3:C35") 30 | .load("example_file_exhibiting_bug.xlsx") 31 | ``` 32 | 33 | ## Context 34 | > How has this issue affected you? What are you trying to accomplish? 
35 | > Providing context helps us come up with a solution that is most useful in the real world 36 | 37 | ## Your Environment 38 | > Include as many relevant details about the environment you experienced the bug in 39 | * Spark version and language (Scala, Java, Python, R, ...): 40 | * Spark-Excel version: 41 | * Operating System and version, cluster environment, ...: 42 | -------------------------------------------------------------------------------- /build.mill: -------------------------------------------------------------------------------- 1 | import coursier.maven.MavenRepository 2 | import mill._, scalalib._, publish._ 3 | import Assembly._ 4 | import $ivy.`de.tototec::de.tobiasroeser.mill.vcs.version::0.4.0` 5 | import de.tobiasroeser.mill.vcs.version.VcsVersion 6 | 7 | trait SparkModule extends Cross.Module2[String, String] with SbtModule with SonatypeCentralPublishModule { 8 | outer => 9 | override def scalaVersion = crossValue 10 | val sparkVersion = crossValue2 11 | val Array(sparkMajor, sparkMinor, sparkPatch) = sparkVersion.split("\\.") 12 | val sparkBinaryVersion = s"$sparkMajor.$sparkMinor" 13 | 14 | override def millSourcePath = super.millSourcePath / os.up 15 | 16 | object LowerOrEqual { 17 | def unapply(otherVersion: String): Boolean = otherVersion match { 18 | case s"${sparkMaj}.${sparkMin}.${sparkPat}" => 19 | sparkMaj == sparkMajor && (sparkMin < sparkMinor || (sparkMin == sparkMinor && sparkPat <= sparkPatch)) 20 | case s"${sparkMaj}.${sparkMin}" => sparkMaj == sparkMajor && sparkMin <= sparkMinor 21 | case sparkMaj => sparkMaj == sparkMajor 22 | } 23 | } 24 | object HigherOrEqual { 25 | def unapply(otherVersion: String): Boolean = otherVersion match { 26 | case s"${sparkMaj}.${sparkMin}.${sparkPat}" => 27 | sparkMaj == sparkMajor && (sparkMin > sparkMinor || (sparkMin == sparkMinor && sparkPat >= sparkPatch)) 28 | case s"${sparkMaj}.${sparkMin}" => sparkMaj == sparkMajor && sparkMin >= sparkMinor 29 | case sparkMaj => sparkMaj == sparkMajor 30 | } 31 | } 32 | 33 | def sparkVersionSpecificSources = T { 34 | val versionSpecificDirs = os.list(mill.api.WorkspaceRoot.workspaceRoot / "src" / "main") 35 | val Array(sparkMajor, sparkMinor, sparkPatch) = sparkVersion.split("\\.") 36 | val sparkBinaryVersion = s"$sparkMajor.$sparkMinor" 37 | versionSpecificDirs.filter(_.last match { 38 | case "scala" => true 39 | case `sparkBinaryVersion` => true 40 | case s"${LowerOrEqual()}_and_up" => true 41 | case s"${LowerOrEqual()}_to_${HigherOrEqual()}" => true 42 | case _ => false 43 | }) 44 | } 45 | override def sources = T.sources { 46 | super.sources() ++ sparkVersionSpecificSources().map(PathRef(_)) 47 | } 48 | 49 | override def docSources = T.sources(Seq[PathRef]()) 50 | 51 | override def artifactName = "spark-excel" 52 | 53 | override def publishVersion: T[String] = T { 54 | val vcsVersion = VcsVersion.vcsState().format(untaggedSuffix = "-SNAPSHOT") 55 | s"${sparkVersion}_${vcsVersion}" 56 | } 57 | def pomSettings = PomSettings( 58 | description = "A Spark plugin for reading and writing Excel files", 59 | organization = "dev.mauch", 60 | url = "https://github.com/nightscape/spark-excel", 61 | licenses = Seq(License.`Apache-2.0`), 62 | versionControl = VersionControl.github("nightscape", "spark-excel"), 63 | developers = Seq(Developer("nightscape", "Martin Mauch", "https://github.com/nightscape")) 64 | ) 65 | 66 | def assemblyRules = Seq( 67 | Rule.AppendPattern(".*\\.conf"), // all *.conf files will be concatenated into single file 68 | Rule.Relocate("org.apache.commons.io.**", 
"shadeio.commons.io.@1"), 69 | Rule.Relocate("org.apache.commons.compress.**", "shadeio.commons.compress.@1") 70 | ) 71 | 72 | override def extraPublish = Seq( 73 | PublishInfo(assembly(), classifier = None, ivyConfig = "compile"), 74 | PublishInfo(jar(), classifier = Some("thin"), ivyConfig = "compile") 75 | ) 76 | 77 | override def sonatypeCentralReadTimeout: T[Int] = 600000 78 | override def sonatypeCentralAwaitTimeout: T[Int] = 1200 * 1000 79 | 80 | val sparkDeps = Agg( 81 | ivy"org.apache.spark::spark-core:$sparkVersion", 82 | ivy"org.apache.spark::spark-sql:$sparkVersion", 83 | ivy"org.apache.spark::spark-hive:$sparkVersion" 84 | ) 85 | 86 | override def compileIvyDeps = if (sparkVersion < "3.3.0") { 87 | sparkDeps ++ Agg(ivy"org.slf4j:slf4j-api:1.7.36".excludeOrg("stax")) 88 | } else { 89 | sparkDeps 90 | } 91 | 92 | val poiVersion = "5.4.1" 93 | 94 | override def ivyDeps = { 95 | val base = Agg( 96 | ivy"org.apache.poi:poi:$poiVersion", 97 | ivy"org.apache.poi:poi-ooxml:$poiVersion", 98 | ivy"org.apache.poi:poi-ooxml-lite:$poiVersion", 99 | ivy"org.apache.xmlbeans:xmlbeans:5.3.0", 100 | ivy"com.norbitltd::spoiwo:2.2.1", 101 | ivy"com.github.pjfanning:excel-streaming-reader:5.1.0", 102 | ivy"commons-io:commons-io:2.19.0", 103 | ivy"org.apache.commons:commons-compress:1.27.1", 104 | ivy"org.apache.logging.log4j:log4j-api:2.24.3", 105 | ivy"com.zaxxer:SparseBitSet:1.3", 106 | ivy"org.apache.commons:commons-collections4:4.5.0", 107 | ivy"com.github.virtuald:curvesapi:1.08", 108 | ivy"commons-codec:commons-codec:1.18.0", 109 | ivy"org.apache.commons:commons-math3:3.6.1", 110 | ivy"org.scala-lang.modules::scala-collection-compat:2.13.0" 111 | ) 112 | if (sparkVersion >= "3.3.0") { 113 | base ++ Agg(ivy"org.apache.logging.log4j:log4j-core:2.24.3") 114 | } else { 115 | base 116 | } 117 | } 118 | 119 | object test extends SbtTests with TestModule.ScalaTest { 120 | 121 | override def millSourcePath = super.millSourcePath 122 | 123 | override def sources = T.sources { 124 | Seq(PathRef(millSourcePath / "src" / "test" / "scala")) 125 | } 126 | 127 | override def resources = T.sources { 128 | Seq(PathRef(millSourcePath / "src" / "test" / "resources")) 129 | } 130 | 131 | def scalaVersion = outer.scalaVersion() 132 | 133 | def repositoriesTask = T.task { 134 | super.repositoriesTask() ++ Seq(MavenRepository("https://jitpack.io")) 135 | } 136 | 137 | def ivyDeps = sparkDeps ++ Agg( 138 | ivy"org.typelevel::cats-core:2.13.0", 139 | ivy"org.scalatest::scalatest:3.2.19", 140 | ivy"org.scalatestplus::scalacheck-1-16:3.2.14.0", 141 | ivy"org.scalacheck::scalacheck:1.18.1", 142 | ivy"com.github.alexarchambault::scalacheck-shapeless_1.15:1.3.0", 143 | ivy"com.github.mrpowers::spark-fast-tests:1.3.0", 144 | ivy"org.scalamock::scalamock:5.2.0" 145 | ) 146 | } 147 | 148 | } 149 | 150 | val scala213 = "2.13.16" 151 | val scala212 = "2.12.20" 152 | val spark24 = List("2.4.8") 153 | val spark30 = List("3.0.3") 154 | val spark31 = List("3.1.3") 155 | val spark32 = List("3.2.4") 156 | val spark33 = List("3.3.4") 157 | val spark34 = List("3.4.4", "3.4.1") 158 | val spark35 = List("3.5.6") 159 | val sparkVersions = spark24 ++ spark30 ++ spark31 ++ spark32 ++ spark33 ++ spark34 ++ spark35 160 | val crossMatrix = 161 | sparkVersions.map(spark => (scala212, spark)) ++ 162 | sparkVersions.filter(_ >= "3.2").map(spark => (scala213, spark)) 163 | 164 | object `spark-excel` extends Cross[SparkModule](crossMatrix) {} 165 | -------------------------------------------------------------------------------- /docs/README.md: 
-------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | We need help here! Please send us a PR with any examples or documentation you 4 | care to write that may be of help to others. 5 | 6 | ## Example Notebook 7 | 8 | [spark_excel_examples.ipynb](spark_excel_examples.ipynb) contains examples in 9 | a notebook format. 10 | 11 | ## Azure Synapse 12 | 13 | [azure_synapse.md](azure_synapse.md) has some instructions for loading spark-excel into a 14 | Spark pool for Azure Synapse. -------------------------------------------------------------------------------- /docs/azure_synapse.md: -------------------------------------------------------------------------------- 1 | # Azure Synapse 2 | 3 | Adding the spark-excel library to the Spark workspace will enable reading 4 | and writing of Excel files to an Azure Storage Account. 5 | 6 | At the time of writing, the following libraries have to be added to the 7 | workspace and then configured for each Spark Pool. 8 | 9 | Each library can be downloaded from [Maven Central](https://search.maven.org) 10 | (thanks Sonatype!). 11 | 12 | * spark-excel_2.12-3.1.2_0.16.5-pre2.jar 13 | * log4j-core-2.17.2.jar 14 | * log4j-api-2.17.2.jar 15 | * xmlbeans-5.0.3.jar 16 | * poi-ooxml-lite-5.2.2.jar 17 | * commons-collections4-4.4.jar 18 | 19 | Once those have been applied, the Excel files can be read into a dataframe like so: 20 | 21 | ``` 22 | excel_path = "abfss://<container>@<storage account>.dfs.core.windows.net/<path to excel>" 23 | df = (spark.read 24 | .format("excel") 25 | .load(excel_path) 26 | ) 27 | display(df) 28 | ``` 29 | -------------------------------------------------------------------------------- /private-key.pem.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/private-key.pem.enc -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | Spark-excel Source Code Structure 2 | ================================= 3 | 4 | Under the hood, spark-excel has two implementations. Starting with spark-excel 0.14.0, we added spark-excel V2, which uses the Spark Data Source API V2. 5 | 6 | These two implementations are compatible with each other in terms of options and behavior. However, spark-excel V2 offers features that are not available in the original spark-excel implementation, for example loading multiple Excel files and handling corrupted records. 7 | 8 | The Spark Data Source API V2 has been evolving since Spark 2.3. To keep the spark-excel V2 code to a minimum, spark-excel V2 relies heavily on the utilities and improvements of each upstream Spark version. 9 | 10 | Spark-excel V2 therefore uses Spark-version-specific source folders, for example: 11 | `2.4/.../spark/v2/excel` for the Spark 2.4 Data Source API V2 12 | `3.x/.../spark/v2/excel` for all Spark 3.* Data Source API V2 13 | `3.1_3.2/.../spark/v2/excel` for shared code between Spark 3.1 and Spark 3.2 Data Source API V2 14 | 15 | These structures are also configured in [build.sc](https://github.dev/mauch/spark-excel/blob/main/build.sc#L13), so the project can be compiled for each Spark version.
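To make the split between the two implementations concrete, here is a minimal usage sketch contrasting the original data source with the V2 one (registered under the short name `excel`, see `ExcelDataSource.shortName()` further down). It assumes an active `SparkSession` named `spark`; the file path and the `header` option value are placeholder examples, not fixtures from this repository.

```
// Minimal sketch (assumes an active SparkSession named `spark`).
// Original (V1) implementation, registered as "dev.mauch.spark.excel":
val dfV1 = spark.read
  .format("dev.mauch.spark.excel")
  .option("dataAddress", "'My Sheet'!B3:C35")
  .load("/path/to/example.xlsx")

// V2 implementation, registered as "excel" (supports e.g. loading multiple files at once):
val dfV2 = spark.read
  .format("excel")
  .option("header", "true")
  .load("/path/to/example.xlsx")
```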
16 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/ExcelSparkInternal.scala: -------------------------------------------------------------------------------- 1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 4 | * the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | * specific language governing permissions and limitations under the License. 11 | */ 12 | package org.apache.spark.nightscape 13 | 14 | import java.nio.file.{Files, Paths} 15 | import org.apache.spark.rdd.InputFileBlockHolder 16 | 17 | /** To provide input-file-name value. The sole purpose of this is for proxying into spark internal implementation of 18 | * InputFileBlockHolder 19 | */ 20 | object ExcelSparkInternal { 21 | def setInputFileName(path: String): Unit = { 22 | InputFileBlockHolder.set(path, 0, Files.size(Paths.get(path))) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/excel/v2/ExcelDateTimeStringUtils.scala: -------------------------------------------------------------------------------- 1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 4 | * the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | * specific language governing permissions and limitations under the License. 
11 | */ 12 | package dev.mauch.spark.excel.v2 13 | 14 | import org.apache.spark.unsafe.types.UTF8String 15 | import org.apache.spark.sql.catalyst.util._ 16 | import java.time.ZoneId 17 | import org.apache.spark.sql.catalyst.util.TimestampFormatter 18 | 19 | /** Wrapping the API change between spark 3.0 vs 3.1 */ 20 | object ExcelDateTimeStringUtils { 21 | def stringToTimestamp(v: String, zoneId: ZoneId): Option[Long] = { 22 | val str = UTF8String.fromString(v) 23 | DateTimeUtils.stringToTimestamp(str, java.util.TimeZone.getTimeZone(zoneId)) 24 | } 25 | 26 | def stringToDate(v: String, zoneId: ZoneId): Option[Int] = { 27 | val str = UTF8String.fromString(v) 28 | DateTimeUtils.stringToDate(str) 29 | } 30 | 31 | def getTimestampFormatter(options: ExcelOptions): TimestampFormatter = 32 | TimestampFormatter(options.timestampFormat, java.util.TimeZone.getTimeZone(options.zoneId), options.locale) 33 | 34 | def getDateFormatter(options: ExcelOptions): DateFormatter = 35 | DateFormatter(options.dateFormat, options.locale) 36 | } 37 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/excel/v2/ExcelFilters.scala: -------------------------------------------------------------------------------- 1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 4 | * the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | * specific language governing permissions and limitations under the License. 11 | */ 12 | package dev.mauch.spark.excel.v2 13 | 14 | import org.apache.spark.sql.sources 15 | import org.apache.spark.sql.types.StructType 16 | import org.apache.spark.sql.catalyst.InternalRow 17 | 18 | /** Wrapping the API change between spark 3.0 vs 3.1 */ 19 | class ExcelFilters(filters: Seq[sources.Filter], requiredSchema: StructType) { 20 | def skipRow(row: InternalRow, index: Int): Boolean = { false } 21 | } 22 | 23 | object ExcelFilters { 24 | def pushedFilters(filters: Array[sources.Filter], schema: StructType): Array[sources.Filter] = 25 | filters 26 | } 27 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/excel/v2/ExcelOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 20 | import org.apache.spark.sql.internal.SQLConf 21 | 22 | class ExcelOptions( 23 | @transient 24 | val parameters: CaseInsensitiveMap[String], 25 | val defaultTimeZoneId: String, 26 | val defaultColumnNameOfCorruptRecord: String 27 | ) extends ExcelOptionsTrait 28 | with Serializable { 29 | // all parameter handling is implemented in ExcelOptionsTrait 30 | 31 | def this(parameters: Map[String, String], defaultTimeZoneId: String) = { 32 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, SQLConf.get.columnNameOfCorruptRecord) 33 | } 34 | 35 | def this(parameters: Map[String, String], defaultTimeZoneId: String, defaultColumnNameOfCorruptRecord: String) = { 36 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, defaultColumnNameOfCorruptRecord) 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/excel/v2/ExcelParserBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import _root_.org.apache.spark.sql.catalyst.util.BadRecordException 20 | import org.apache.spark.unsafe.types.UTF8String 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | 23 | trait ExcelParserBase { 24 | 25 | protected def getCurrentInput: UTF8String 26 | def badRecord(partialResults: Array[InternalRow], baseException: Throwable): BadRecordException = 27 | BadRecordException(() => getCurrentInput, () => partialResults.headOption, baseException) 28 | } 29 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/excel/v2/FailureSafeParser.scala: -------------------------------------------------------------------------------- 1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 4 | * the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | * specific language governing permissions and limitations under the License. 
11 | */ 12 | package dev.mauch.spark.excel.v2 13 | 14 | import org.apache.spark.SparkException 15 | import org.apache.spark.sql.catalyst.InternalRow 16 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow 17 | import org.apache.spark.sql.types.StructType 18 | import org.apache.spark.unsafe.types.UTF8String 19 | import org.apache.spark.sql.catalyst.util._ 20 | 21 | class FailureSafeParser[IN]( 22 | rawParser: IN => Iterable[InternalRow], 23 | mode: ParseMode, 24 | schema: StructType, 25 | columnNameOfCorruptRecord: String 26 | ) { 27 | 28 | private val corruptFieldIndex = 29 | if (schema.fieldNames.contains(columnNameOfCorruptRecord)) { 30 | Some(schema.fieldIndex(columnNameOfCorruptRecord)) 31 | } else None 32 | 33 | private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord)) 34 | private val resultRow = new GenericInternalRow(schema.length) 35 | private val nullResult = new GenericInternalRow(schema.length) 36 | 37 | // This function takes 2 parameters: an optional partial result, and the bad record. If the given 38 | // schema doesn't contain a field for corrupted record, we just return the partial result or a 39 | // row with all fields null. If the given schema contains a field for corrupted record, we will 40 | // set the bad record to this field, and set other fields according to the partial result or null. 41 | private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = { 42 | if (corruptFieldIndex.isDefined) { (row, badRecord) => 43 | { 44 | var i = 0 45 | while (i < actualSchema.length) { 46 | val from = actualSchema(i) 47 | resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull 48 | i += 1 49 | } 50 | resultRow(corruptFieldIndex.get) = badRecord() 51 | resultRow 52 | } 53 | } else { (row, _) => row.getOrElse(nullResult) } 54 | } 55 | 56 | def parse(input: IN): Iterator[InternalRow] = { 57 | try { rawParser.apply(input).toIterator.map(row => toResultRow(Some(row), () => null)) } 58 | catch { 59 | case e: BadRecordException => 60 | mode match { 61 | case PermissiveMode => Iterator(toResultRow(e.partialResult(), e.record)) 62 | case DropMalformedMode => Iterator.empty 63 | case FailFastMode => 64 | throw new SparkException( 65 | "Malformed records are detected in record parsing. " + 66 | s"Parse Mode: ${FailFastMode.name}. To process malformed records as null " + 67 | "result, try setting the option 'mode' as 'PERMISSIVE'.", 68 | e 69 | ) 70 | } 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/excel/v2/SchemaUtils.scala: -------------------------------------------------------------------------------- 1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 4 | * the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | * specific language governing permissions and limitations under the License. 
11 | */ 12 | package dev.mauch.spark.excel.v2 13 | 14 | import org.apache.spark.sql.catalyst.analysis._ 15 | import org.apache.spark.sql.types.StructType 16 | 17 | /** Utils for handling schemas. (Copied from spark.util) 18 | */ 19 | object SchemaUtils { 20 | 21 | /** Checks if an input schema has duplicate column names. This throws an exception if the duplication exists. 22 | * 23 | * @param schema 24 | * schema to check 25 | * @param colType 26 | * column type name, used in an exception message 27 | * @param caseSensitiveAnalysis 28 | * whether duplication checks should be case sensitive or not 29 | */ 30 | def checkSchemaColumnNameDuplication( 31 | schema: StructType, 32 | colType: String, 33 | caseSensitiveAnalysis: Boolean = false 34 | ): Unit = { checkColumnNameDuplication(schema.map(_.name), colType, caseSensitiveAnalysis) } 35 | 36 | // Returns true if a given resolver is case-sensitive 37 | private def isCaseSensitiveAnalysis(resolver: Resolver): Boolean = { 38 | if (resolver == caseSensitiveResolution) { true } 39 | else if (resolver == caseInsensitiveResolution) { false } 40 | else { 41 | sys.error( 42 | "A resolver to check if two identifiers are equal must be " + 43 | "`caseSensitiveResolution` or `caseInsensitiveResolution` in o.a.s.sql.catalyst." 44 | ) 45 | } 46 | } 47 | 48 | /** Checks if input column names have duplicate identifiers. This throws an exception if the duplication exists. 49 | * 50 | * @param columnNames 51 | * column names to check 52 | * @param colType 53 | * column type name, used in an exception message 54 | * @param resolver 55 | * resolver used to determine if two identifiers are equal 56 | */ 57 | def checkColumnNameDuplication(columnNames: Seq[String], colType: String, resolver: Resolver): Unit = { 58 | checkColumnNameDuplication(columnNames, colType, isCaseSensitiveAnalysis(resolver)) 59 | } 60 | 61 | /** Checks if input column names have duplicate identifiers. This throws an exception if the duplication exists. 62 | * 63 | * @param columnNames 64 | * column names to check 65 | * @param colType 66 | * column type name, used in an exception message 67 | * @param caseSensitiveAnalysis 68 | * whether duplication checks should be case sensitive or not 69 | */ 70 | def checkColumnNameDuplication(columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { 71 | val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase) 72 | if (names.distinct.length != names.length) { 73 | val duplicateColumns = names 74 | .groupBy(identity) 75 | .collect { case (x, ys) if ys.length > 1 => s"`$x`" } 76 | throw new RuntimeException(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}") 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/3.0/scala/dev/mauch/spark/excel/v2/ExcelDateTimeStringUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.unsafe.types.UTF8String 20 | import org.apache.spark.sql.catalyst.util._ 21 | import java.time.ZoneId 22 | import org.apache.spark.sql.catalyst.util.TimestampFormatter 23 | import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT 24 | 25 | /** Wrapping the API change between spark 3.0 vs 3.1 */ 26 | object ExcelDateTimeStringUtils { 27 | def stringToTimestamp(v: String, zoneId: ZoneId): Option[Long] = { 28 | val str = UTF8String.fromString(DateTimeUtils.cleanLegacyTimestampStr(v)) 29 | DateTimeUtils.stringToTimestamp(str, zoneId) 30 | } 31 | 32 | def stringToDate(v: String, zoneId: ZoneId): Option[Int] = { 33 | val str = UTF8String.fromString(DateTimeUtils.cleanLegacyTimestampStr(v)) 34 | DateTimeUtils.stringToDate(str, zoneId) 35 | } 36 | 37 | def getTimestampFormatter(options: ExcelOptions): TimestampFormatter = TimestampFormatter( 38 | options.timestampFormat, 39 | options.zoneId, 40 | options.locale, 41 | legacyFormat = FAST_DATE_FORMAT, 42 | isParsing = true 43 | ) 44 | 45 | def getDateFormatter(options: ExcelOptions): DateFormatter = 46 | DateFormatter(options.dateFormat, options.zoneId, options.locale, legacyFormat = FAST_DATE_FORMAT, isParsing = true) 47 | } 48 | -------------------------------------------------------------------------------- /src/main/3.0/scala/dev/mauch/spark/excel/v2/ExcelFilters.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.catalyst.csv.CSVFilters 20 | import org.apache.spark.sql.sources 21 | import org.apache.spark.sql.types.StructType 22 | 23 | /** Wrapping the API change between spark 3.0 vs 3.1 */ 24 | class ExcelFilters(filters: Seq[sources.Filter], requiredSchema: StructType) 25 | extends CSVFilters(filters, requiredSchema) {} 26 | 27 | object ExcelFilters { 28 | def pushedFilters(filters: Array[sources.Filter], schema: StructType): Array[sources.Filter] = 29 | CSVFilters.pushedFilters(filters, schema) 30 | } 31 | -------------------------------------------------------------------------------- /src/main/3.0_and_up/scala/dev/mauch/spark/excel/v2/ExcelDataSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.connector.catalog.Table 20 | import org.apache.spark.sql.execution.datasources.FileFormat 21 | import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 22 | import org.apache.spark.sql.types.StructType 23 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 24 | 25 | /** Derived from Spark own CSV implementation 26 | */ 27 | class ExcelDataSource extends FileDataSourceV2 { 28 | 29 | override def fallbackFileFormat: Class[_ <: FileFormat] = classOf[ExcelFileFormat] 30 | 31 | override def getTable(options: CaseInsensitiveStringMap): Table = { 32 | val paths = getPaths(options) 33 | val tableName = getTableName(options, paths) 34 | val optionsWithoutPaths = getOptionsWithoutPaths(options) 35 | ExcelTable(tableName, sparkSession, optionsWithoutPaths, paths, None) 36 | } 37 | 38 | override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = { 39 | val paths = getPaths(options) 40 | val tableName = getTableName(options, paths) 41 | val optionsWithoutPaths = getOptionsWithoutPaths(options) 42 | ExcelTable(tableName, sparkSession, optionsWithoutPaths, paths, Some(schema)) 43 | } 44 | 45 | /** The string that represents the format that this data source provider uses 46 | */ 47 | override def shortName(): String = "excel" 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/main/3.0_and_up/scala/dev/mauch/spark/excel/v2/ExcelFileFormat.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.conf.Configuration 20 | import org.apache.hadoop.fs.{FileStatus, Path} 21 | import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} 22 | import org.apache.spark.sql.SparkSession 23 | import org.apache.spark.sql.catalyst.InternalRow 24 | import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriter, OutputWriterFactory, PartitionedFile} 25 | import org.apache.spark.sql.sources.{DataSourceRegister, Filter} 26 | import org.apache.spark.sql.types._ 27 | 28 | /** derived from binary file data source. 
Needed to support writing excel using the V2 API 29 | */ 30 | class ExcelFileFormat extends FileFormat with DataSourceRegister { 31 | 32 | override def inferSchema( 33 | sparkSession: SparkSession, 34 | options: Map[String, String], 35 | files: Seq[FileStatus] 36 | ): Option[StructType] = { 37 | throw new UnsupportedOperationException("ExcelFileFormat as fallback format for V2 supports writing only") 38 | } 39 | 40 | override def prepareWrite( 41 | sparkSession: SparkSession, 42 | job: Job, 43 | options: Map[String, String], 44 | dataSchema: StructType 45 | ): OutputWriterFactory = { 46 | val excelOptions = new ExcelOptions(options, sparkSession.conf.get("spark.sql.session.timeZone")) 47 | 48 | new OutputWriterFactory { 49 | override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { 50 | new ExcelOutputWriter(path, dataSchema, context, excelOptions) 51 | } 52 | 53 | override def getFileExtension(context: TaskAttemptContext): String = 54 | s".${excelOptions.fileExtension}" 55 | } 56 | } 57 | 58 | override def isSplitable(sparkSession: SparkSession, options: Map[String, String], path: Path): Boolean = { 59 | false 60 | } 61 | 62 | override def shortName(): String = "excel" 63 | 64 | /* 65 | We need this class for writing only, thus reader is not implemented 66 | */ 67 | override protected def buildReader( 68 | sparkSession: SparkSession, 69 | dataSchema: StructType, 70 | partitionSchema: StructType, 71 | requiredSchema: StructType, 72 | filters: Seq[Filter], 73 | options: Map[String, String], 74 | hadoopConf: Configuration 75 | ): PartitionedFile => Iterator[InternalRow] = { 76 | throw new UnsupportedOperationException("ExcelFileFormat as fallback format for V2 supports writing only") 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.1/scala/dev/mauch/spark/excel/v2/ExcelOutputWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
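// ---- Editor's example (not part of ExcelOutputWriter.scala): writing through the V2 path. ----
// A minimal sketch with assumed values; each write task produces one workbook file whose extension
// comes from the factory's getFileExtension (e.g. ".xlsx").
object ExcelV2WriteExample {
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("excel-v2-write").master("local[*]").getOrCreate()
    import spark.implicits._
    val scores = Seq(("Alice", 42), ("Bob", 7)).toDF("name", "score")
    scores
      .coalesce(1)              // one partition -> a single output workbook
      .write
      .format("excel")
      .option("header", "true") // ExcelOutputWriter emits a header row when this is set
      .mode("overwrite")
      .save("/tmp/scores_xlsx") // hypothetical output directory
    spark.stop()
  }
}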
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.mapreduce.TaskAttemptContext 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | import org.apache.spark.sql.execution.datasources.OutputWriter 23 | import org.apache.spark.sql.types.StructType 24 | 25 | class ExcelOutputWriter(path: String, dataSchema: StructType, context: TaskAttemptContext, options: ExcelOptions) 26 | extends OutputWriter 27 | with Logging { 28 | 29 | private val gen = new ExcelGenerator(path, dataSchema, context.getConfiguration, options) 30 | if (options.header) { gen.writeHeaders() } 31 | 32 | override def write(row: InternalRow): Unit = gen.write(row) 33 | 34 | override def close(): Unit = gen.close() 35 | } 36 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.1/scala/dev/mauch/spark/excel/v2/ExcelTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.fs.FileStatus 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.connector.write.LogicalWriteInfo 22 | import org.apache.spark.sql.connector.write.WriteBuilder 23 | import org.apache.spark.sql.execution.datasources.FileFormat 24 | import org.apache.spark.sql.execution.datasources.v2.FileTable 25 | import org.apache.spark.sql.types.DataType 26 | import org.apache.spark.sql.types.StructType 27 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 28 | import org.apache.spark.sql.connector.catalog.TableCapability 29 | import org.apache.spark.sql.connector.catalog.TableCapability._ 30 | import scala.jdk.CollectionConverters._ 31 | 32 | case class ExcelTable( 33 | name: String, 34 | sparkSession: SparkSession, 35 | map: CaseInsensitiveStringMap, 36 | paths: Seq[String], 37 | userSpecifiedSchema: Option[StructType] 38 | ) extends FileTable(sparkSession, map, paths, userSpecifiedSchema) { 39 | 40 | override def newScanBuilder(params: CaseInsensitiveStringMap): ExcelScanBuilder = 41 | ExcelScanBuilder(sparkSession, fileIndex, schema, dataSchema, params) 42 | 43 | override def inferSchema(files: Seq[FileStatus]): Option[StructType] = { 44 | val options = 45 | new ExcelOptions(map.asScala.toMap, sparkSession.sessionState.conf.sessionLocalTimeZone) 46 | 47 | if (files.nonEmpty) Some(infer(sparkSession, files, options)) else Some(StructType(Seq.empty)) 48 | } 49 | 50 | override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = 51 | new ExcelWriteBuilder(paths, formatName, supportsDataType, info) 52 | 53 | override def supportsDataType(dataType: DataType): Boolean = true 54 | 55 | override def formatName: String = "Excel" 56 | 57 | override def fallbackFileFormat: Class[_ <: FileFormat] = 58 | throw new UnsupportedOperationException("Excel 
does not support V1 File Format") 59 | 60 | override def capabilities: java.util.Set[TableCapability] = 61 | Set(ACCEPT_ANY_SCHEMA, BATCH_READ, BATCH_WRITE, OVERWRITE_BY_FILTER, TRUNCATE).asJava 62 | 63 | /* Actual doing schema inferring */ 64 | private def infer(sparkSession: SparkSession, inputPaths: Seq[FileStatus], options: ExcelOptions): StructType = { 65 | val excelHelper = ExcelHelper(options) 66 | val conf = sparkSession.sessionState.newHadoopConf() 67 | 68 | /** Sampling ratio on file level (not row level as in CSV) */ 69 | val paths = { 70 | var sample = (inputPaths.size * options.samplingRatio).intValue 71 | sample = if (sample < 1) 1 else sample 72 | inputPaths.take(sample).map(_.getPath.toUri) 73 | } 74 | val (sheetData, colNames) = excelHelper.parseSheetData(conf, paths) 75 | try { 76 | if (sheetData.rowIterator.isEmpty) { 77 | /* If the first file is empty, not checking further */ 78 | StructType(Seq.empty) 79 | } else { 80 | /* Ready to infer schema */ 81 | ExcelInferSchema(options).infer(sheetData.rowIterator, colNames) 82 | } 83 | } finally { 84 | sheetData.close() 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.1/scala/dev/mauch/spark/excel/v2/ExcelWriteBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.mapreduce.Job 20 | import org.apache.hadoop.mapreduce.TaskAttemptContext 21 | import org.apache.spark.sql.connector.write.LogicalWriteInfo 22 | import org.apache.spark.sql.execution.datasources.OutputWriter 23 | import org.apache.spark.sql.execution.datasources.OutputWriterFactory 24 | import org.apache.spark.sql.execution.datasources.v2.FileWriteBuilder 25 | import org.apache.spark.sql.internal.SQLConf 26 | import org.apache.spark.sql.types.DataType 27 | import org.apache.spark.sql.types.StructType 28 | 29 | class ExcelWriteBuilder( 30 | paths: Seq[String], 31 | formatName: String, 32 | supportsDataType: DataType => Boolean, 33 | info: LogicalWriteInfo 34 | ) extends FileWriteBuilder(paths, formatName, supportsDataType, info) { 35 | override def prepareWrite( 36 | sqlConf: SQLConf, 37 | job: Job, 38 | options: Map[String, String], 39 | dataSchema: StructType 40 | ): OutputWriterFactory = { 41 | 42 | val excelOptions = new ExcelOptions(options, sqlConf.sessionLocalTimeZone) 43 | 44 | new OutputWriterFactory { 45 | override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { 46 | new ExcelOutputWriter(path, dataSchema, context, excelOptions) 47 | } 48 | 49 | override def getFileExtension(context: TaskAttemptContext): String = 50 | s".${excelOptions.fileExtension}" 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.2/scala/dev/mauch/spark/excel/v2/ExcelScan.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.catalyst.expressions.{ExprUtils, Expression} 22 | import org.apache.spark.sql.connector.read.PartitionReaderFactory 23 | import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex 24 | import org.apache.spark.sql.execution.datasources.v2.{FileScan, TextBasedFileScan} 25 | import org.apache.spark.sql.sources.Filter 26 | import org.apache.spark.sql.types.StructType 27 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 28 | import org.apache.spark.util.SerializableConfiguration 29 | 30 | import scala.collection.compat.immutable.ArraySeq 31 | import scala.jdk.CollectionConverters._ 32 | 33 | case class ExcelScan( 34 | sparkSession: SparkSession, 35 | fileIndex: PartitioningAwareFileIndex, 36 | dataSchema: StructType, 37 | readDataSchema: StructType, 38 | readPartitionSchema: StructType, 39 | options: CaseInsensitiveStringMap, 40 | pushedFilters: Array[Filter], 41 | partitionFilters: Seq[Expression] = Seq.empty, 42 | dataFilters: Seq[Expression] = Seq.empty 43 | ) extends TextBasedFileScan(sparkSession, options) { 44 | 45 | private lazy val parsedOptions: ExcelOptions = new ExcelOptions( 46 | options.asScala.toMap, 47 | sparkSession.sessionState.conf.sessionLocalTimeZone, 48 | sparkSession.sessionState.conf.columnNameOfCorruptRecord 49 | ) 50 | 51 | override def isSplitable(path: Path): Boolean = false 52 | 53 | override def getFileUnSplittableReason(path: Path): String = { 54 | "No practical method of splitting an excel file" 55 | } 56 | 57 | override def createReaderFactory(): PartitionReaderFactory = { 58 | 59 | /* Check a field requirement for corrupt records here to throw an exception in a driver side 60 | */ 61 | ExprUtils.verifyColumnNameOfCorruptRecord(dataSchema, parsedOptions.columnNameOfCorruptRecord) 62 | 63 | if ( 64 | readDataSchema.length == 1 && 65 | readDataSchema.head.name == parsedOptions.columnNameOfCorruptRecord 66 | ) { 67 | throw new RuntimeException( 68 | "Queries from raw Excel files are disallowed when the referenced " + 69 | "columns only include the internal corrupt record column" 70 | ) 71 | } 72 | 73 | val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap 74 | 75 | /* Hadoop Configurations are case sensitive. */ 76 | val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) 77 | 78 | val broadcastedConf = sparkSession.sparkContext 79 | .broadcast(new SerializableConfiguration(hadoopConf)) 80 | 81 | /* The partition values are already truncated in `FileScan.partitions`. We should use `readPartitionSchema` as the 82 | * partition schema here. 
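    // Editor's note (illustrative, not from the original source): with a partitioned layout such as
    //   /data/sales/Quarter=1/a.xlsx  and  /data/sales/Quarter=2/b.xlsx   (hypothetical paths)
    // a read like
    //   spark.read.format("excel").option("header", "true").load("/data/sales")
    // exposes `Quarter` as a partition column whose values come from the directory names rather than
    // from the sheets; that is why only `readPartitionSchema` is handed to the reader factory below.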
83 | */ 84 | ExcelPartitionReaderFactory( 85 | sparkSession.sessionState.conf, 86 | broadcastedConf, 87 | dataSchema, 88 | readDataSchema, 89 | readPartitionSchema, 90 | parsedOptions, 91 | ArraySeq.unsafeWrapArray(pushedFilters) 92 | ) 93 | } 94 | 95 | override def withFilters(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan = 96 | this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) 97 | 98 | override def equals(obj: Any): Boolean = obj match { 99 | case c: ExcelScan => 100 | super.equals(c) && dataSchema == c.dataSchema && options == c.options && 101 | equivalentFilters(pushedFilters, c.pushedFilters) 102 | case _ => false 103 | } 104 | 105 | override def hashCode(): Int = super.hashCode() 106 | 107 | override def description(): String = { 108 | super.description() + ", PushedFilters: " + pushedFilters.mkString("[", ", ", "]") 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.2/scala/dev/mauch/spark/excel/v2/ExcelScanBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.SparkSession 20 | import org.apache.spark.sql.connector.read.{Scan, SupportsPushDownFilters} 21 | import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex 22 | import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder 23 | import org.apache.spark.sql.sources.Filter 24 | import org.apache.spark.sql.types.StructType 25 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 26 | 27 | case class ExcelScanBuilder( 28 | sparkSession: SparkSession, 29 | fileIndex: PartitioningAwareFileIndex, 30 | schema: StructType, 31 | dataSchema: StructType, 32 | options: CaseInsensitiveStringMap 33 | ) extends FileScanBuilder(sparkSession, fileIndex, dataSchema) 34 | with SupportsPushDownFilters { 35 | 36 | override def build(): Scan = { 37 | ExcelScan(sparkSession, fileIndex, dataSchema, readDataSchema(), readPartitionSchema(), options, pushedFilters()) 38 | } 39 | 40 | private var _pushedFilters: Array[Filter] = Array.empty 41 | 42 | override def pushFilters(filters: Array[Filter]): Array[Filter] = { 43 | _pushedFilters = ExcelFilters.pushedFilters(filters, dataSchema) 44 | filters 45 | } 46 | 47 | override def pushedFilters(): Array[Filter] = _pushedFilters 48 | } 49 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.3/scala/dev/mauch/spark/excel/v2/ExcelOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
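// ---- Editor's example (not part of ExcelOptions.scala): constructing the options directly. ----
// A minimal sketch; the option map and time zone are illustrative. The auxiliary constructors differ
// only in whether the corrupt-record column name comes from SQLConf or is supplied by the caller.
object ExcelOptionsExample {
  import dev.mauch.spark.excel.v2.ExcelOptions

  def main(args: Array[String]): Unit = {
    val opts = new ExcelOptions(
      Map("header" -> "true", "dataAddress" -> "'Sheet1'!A1"), // hypothetical option values
      "UTC"                                                    // defaultTimeZoneId
    )
    println(opts.header) // individual option parsing lives in ExcelOptionsTrait
  }
}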
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 20 | import org.apache.spark.sql.internal.SQLConf 21 | 22 | class ExcelOptions( 23 | @transient 24 | val parameters: CaseInsensitiveMap[String], 25 | val defaultTimeZoneId: String, 26 | val defaultColumnNameOfCorruptRecord: String 27 | ) extends ExcelOptionsTrait 28 | with Serializable { 29 | // all parameter handling is implemented in ExcelOptionsTrait 30 | 31 | def this(parameters: Map[String, String], defaultTimeZoneId: String) = { 32 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, SQLConf.get.columnNameOfCorruptRecord) 33 | } 34 | 35 | def this(parameters: Map[String, String], defaultTimeZoneId: String, defaultColumnNameOfCorruptRecord: String) = { 36 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, defaultColumnNameOfCorruptRecord) 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.4.1/scala/dev/mauch/spark/excel/v2/ExcelParserBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import _root_.org.apache.spark.sql.catalyst.util.BadRecordException 20 | import org.apache.spark.unsafe.types.UTF8String 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | 23 | trait ExcelParserBase { 24 | 25 | protected def getCurrentInput: UTF8String 26 | def badRecord(partialResults: Array[InternalRow], baseException: Throwable): BadRecordException = 27 | BadRecordException(() => getCurrentInput, () => partialResults.headOption, baseException) 28 | } 29 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.4.1/scala/dev/mauch/spark/excel/v2/ExcelPartitionReaderFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.conf.Configuration 20 | import org.apache.spark.broadcast.Broadcast 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | import org.apache.spark.sql.connector.read.PartitionReader 23 | import org.apache.spark.sql.execution.datasources.PartitionedFile 24 | import org.apache.spark.sql.execution.datasources.v2._ 25 | import org.apache.spark.sql.internal.SQLConf 26 | import org.apache.spark.sql.sources.Filter 27 | import org.apache.spark.sql.types.StructType 28 | import org.apache.spark.util.SerializableConfiguration 29 | 30 | import java.net.URI 31 | import scala.util.control.NonFatal 32 | 33 | /** A factory used to create Excel readers. 34 | * 35 | * @param sqlConf 36 | * SQL configuration. 37 | * @param broadcastedConf 38 | * Broadcasted serializable Hadoop Configuration. 39 | * @param dataSchema 40 | * Schema of Excel files. 41 | * @param readDataSchema 42 | * Required data schema in the batch scan. 43 | * @param partitionSchema 44 | * Schema of partitions. 45 | * @param options 46 | * Options for parsing Excel files. 47 | */ 48 | case class ExcelPartitionReaderFactory( 49 | sqlConf: SQLConf, 50 | broadcastedConf: Broadcast[SerializableConfiguration], 51 | dataSchema: StructType, 52 | readDataSchema: StructType, 53 | partitionSchema: StructType, 54 | options: ExcelOptions, 55 | filters: Seq[Filter] 56 | ) extends FilePartitionReaderFactory { 57 | 58 | override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = { 59 | val conf = broadcastedConf.value.value 60 | val actualDataSchema = 61 | StructType(dataSchema.filterNot(_.name == options.columnNameOfCorruptRecord)) 62 | val actualReadDataSchema = 63 | StructType(readDataSchema.filterNot(_.name == options.columnNameOfCorruptRecord)) 64 | val parser = new ExcelParser(actualDataSchema, actualReadDataSchema, options, filters) 65 | val headerChecker = 66 | new ExcelHeaderChecker(actualReadDataSchema, options, source = s"Excel file: ${file.filePath}") 67 | val iter = readFile(conf, file, parser, headerChecker, readDataSchema) 68 | val partitionReader = new SparkExcelPartitionReaderFromIterator(iter) 69 | new PartitionReaderWithPartitionValues(partitionReader, readDataSchema, partitionSchema, file.partitionValues) 70 | } 71 | 72 | private def readFile( 73 | conf: Configuration, 74 | file: PartitionedFile, 75 | parser: ExcelParser, 76 | headerChecker: ExcelHeaderChecker, 77 | requiredSchema: StructType 78 | ): SheetData[InternalRow] = { 79 | val excelHelper = ExcelHelper(options) 80 | val sheetData = excelHelper.getSheetData(conf, URI.create(file.filePath.toString)) 81 | try { 82 | SheetData( 83 | ExcelParser.parseIterator(sheetData.rowIterator, parser, headerChecker, requiredSchema), 84 | sheetData.resourcesToClose 85 | ) 86 | } catch { 87 | case NonFatal(t) => { 88 | sheetData.close() 89 | throw t 90 | } 91 | } 92 | } 93 | 94 | } 95 | 96 | private class SparkExcelPartitionReaderFromIterator(sheetData: SheetData[InternalRow]) 97 | extends 
PartitionReaderFromIterator[InternalRow](sheetData.rowIterator) { 98 | override def close(): Unit = { 99 | super.close() 100 | sheetData.close() 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/main/3.1/scala/dev/mauch/spark/excel/v2/ExcelDateTimeStringUtils.scala: -------------------------------------------------------------------------------- 1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 4 | * the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | * specific language governing permissions and limitations under the License. 11 | */ 12 | package dev.mauch.spark.excel.v2 13 | 14 | import org.apache.spark.unsafe.types.UTF8String 15 | import org.apache.spark.sql.catalyst.util._ 16 | import java.time.ZoneId 17 | import org.apache.spark.sql.catalyst.util.TimestampFormatter 18 | import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT 19 | 20 | object ExcelDateTimeStringUtils { 21 | def stringToTimestamp(v: String, zoneId: ZoneId): Option[Long] = { 22 | val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(v)) 23 | DateTimeUtils.stringToTimestamp(str, zoneId) 24 | } 25 | 26 | def stringToDate(v: String, zoneId: ZoneId): Option[Int] = { 27 | val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(v)) 28 | DateTimeUtils.stringToDate(str, zoneId) 29 | } 30 | 31 | def getTimestampFormatter(options: ExcelOptions): TimestampFormatter = TimestampFormatter( 32 | options.timestampFormat, 33 | options.zoneId, 34 | options.locale, 35 | legacyFormat = FAST_DATE_FORMAT, 36 | isParsing = true 37 | ) 38 | 39 | def getDateFormatter(options: ExcelOptions): DateFormatter = 40 | DateFormatter(options.dateFormat, options.zoneId, options.locale, legacyFormat = FAST_DATE_FORMAT, isParsing = true) 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/3.1_and_up/scala/dev/mauch/spark/excel/v2/ExcelFilters.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
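// ---- Editor's example (not part of ExcelFilters.scala): which filters survive pushdown. ----
// Illustrative only, for an assumed schema; filters referencing known columns are kept so the
// parser can skip non-matching rows early.
object ExcelFiltersExample {
  import dev.mauch.spark.excel.v2.ExcelFilters
  import org.apache.spark.sql.sources
  import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(StructField("name", StringType), StructField("age", IntegerType)))
    val requested: Array[sources.Filter] = Array(sources.IsNotNull("name"), sources.GreaterThan("age", 18))
    val pushed = ExcelFilters.pushedFilters(requested, schema)
    println(pushed.mkString(", "))
  }
}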
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.catalyst.OrderedFilters 20 | import org.apache.spark.sql.catalyst.StructFilters 21 | import org.apache.spark.sql.sources 22 | import org.apache.spark.sql.types.StructType 23 | 24 | /** Wrapping the API change between spark 3.0 vs 3.1 */ 25 | class ExcelFilters(filters: Seq[sources.Filter], requiredSchema: StructType) 26 | extends OrderedFilters(filters, requiredSchema) {} 27 | 28 | object ExcelFilters { 29 | def pushedFilters(filters: Array[sources.Filter], schema: StructType): Array[sources.Filter] = 30 | StructFilters.pushedFilters(filters, schema) 31 | } 32 | -------------------------------------------------------------------------------- /src/main/3.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelDateTimeStringUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.unsafe.types.UTF8String 20 | import org.apache.spark.sql.catalyst.util._ 21 | 22 | import java.time.ZoneId 23 | import org.apache.spark.sql.catalyst.util.DateFormatter 24 | import org.apache.spark.sql.catalyst.util.TimestampFormatter 25 | import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT 26 | 27 | import scala.annotation.nowarn 28 | 29 | object ExcelDateTimeStringUtils { 30 | def stringToTimestamp(v: String, zoneId: ZoneId): Option[Long] = { 31 | val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(v)) 32 | DateTimeUtils.stringToTimestamp(str, zoneId) 33 | } 34 | 35 | @nowarn 36 | def stringToDate(v: String, zoneId: ZoneId): Option[Int] = { 37 | val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(v)) 38 | DateTimeUtils.stringToDate(str) 39 | } 40 | 41 | def getTimestampFormatter(options: ExcelOptions): TimestampFormatter = TimestampFormatter( 42 | options.timestampFormat, 43 | options.zoneId, 44 | options.locale, 45 | legacyFormat = FAST_DATE_FORMAT, 46 | isParsing = true 47 | ) 48 | 49 | def getDateFormatter(options: ExcelOptions): DateFormatter = 50 | DateFormatter(options.dateFormat, options.locale, legacyFormat = FAST_DATE_FORMAT, isParsing = true) 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/3.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelOutputWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.mapreduce.TaskAttemptContext 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | import org.apache.spark.sql.execution.datasources.OutputWriter 23 | import org.apache.spark.sql.types.StructType 24 | 25 | class ExcelOutputWriter(val path: String, dataSchema: StructType, context: TaskAttemptContext, options: ExcelOptions) 26 | extends OutputWriter 27 | with Logging { 28 | 29 | private val gen = new ExcelGenerator(path, dataSchema, context.getConfiguration, options) 30 | if (options.header) { gen.writeHeaders() } 31 | 32 | override def write(row: InternalRow): Unit = gen.write(row) 33 | 34 | override def close(): Unit = gen.close() 35 | } 36 | -------------------------------------------------------------------------------- /src/main/3.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
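// ---- Editor's note (not part of ExcelTable.scala): file-level sampling for schema inference. ----
// A small sketch of the arithmetic used in `infer` below, with assumed numbers: samplingRatio
// selects a prefix of the input files (never fewer than one), unlike CSV's row-level sampling.
object SamplingRatioExample {
  def main(args: Array[String]): Unit = {
    val inputFiles = 10
    val samplingRatio = 0.25                                // hypothetical option value
    val sampled = math.max(1, (inputFiles * samplingRatio).toInt)
    println(s"files parsed for schema inference: $sampled") // prints 2
  }
}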
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.fs.FileStatus 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.connector.write.Write 22 | import org.apache.spark.sql.connector.write.LogicalWriteInfo 23 | import org.apache.spark.sql.connector.write.WriteBuilder 24 | import org.apache.spark.sql.execution.datasources.FileFormat 25 | import org.apache.spark.sql.execution.datasources.v2.FileTable 26 | import org.apache.spark.sql.types.DataType 27 | import org.apache.spark.sql.types.StructType 28 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 29 | import org.apache.spark.sql.connector.catalog.TableCapability 30 | import org.apache.spark.sql.connector.catalog.TableCapability._ 31 | import scala.jdk.CollectionConverters._ 32 | 33 | case class ExcelTable( 34 | name: String, 35 | sparkSession: SparkSession, 36 | map: CaseInsensitiveStringMap, 37 | paths: Seq[String], 38 | userSpecifiedSchema: Option[StructType] 39 | ) extends FileTable(sparkSession, map, paths, userSpecifiedSchema) { 40 | 41 | override def newScanBuilder(params: CaseInsensitiveStringMap): ExcelScanBuilder = 42 | ExcelScanBuilder(sparkSession, fileIndex, schema, dataSchema, params) 43 | 44 | override def inferSchema(files: Seq[FileStatus]): Option[StructType] = { 45 | val options = 46 | new ExcelOptions(map.asScala.toMap, sparkSession.sessionState.conf.sessionLocalTimeZone) 47 | 48 | if (files.nonEmpty) Some(infer(sparkSession, files, options)) else Some(StructType(Seq.empty)) 49 | } 50 | 51 | override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = 52 | new WriteBuilder { 53 | override def build(): Write = ExcelWriteBuilder(paths, formatName, supportsDataType, info) 54 | } 55 | 56 | override def supportsDataType(dataType: DataType): Boolean = true 57 | 58 | override def formatName: String = "Excel" 59 | 60 | override def fallbackFileFormat: Class[_ <: FileFormat] = 61 | throw new UnsupportedOperationException("Excel does not support V1 File Format") 62 | 63 | override def capabilities: java.util.Set[TableCapability] = 64 | Set(ACCEPT_ANY_SCHEMA, BATCH_READ, BATCH_WRITE, OVERWRITE_BY_FILTER, TRUNCATE).asJava 65 | 66 | /* Actual doing schema inferring */ 67 | private def infer(sparkSession: SparkSession, inputPaths: Seq[FileStatus], options: ExcelOptions): StructType = { 68 | val excelHelper = ExcelHelper(options) 69 | val conf = sparkSession.sessionState.newHadoopConf() 70 | 71 | /* Sampling ratio on file level (not row level as in CSV) */ 72 | val paths = { 73 | var sample = (inputPaths.size * options.samplingRatio).intValue 74 | sample = if (sample < 1) 1 else sample 75 | inputPaths.take(sample).map(_.getPath.toUri) 76 | } 77 | val (sheetData, colNames) = excelHelper.parseSheetData(conf, paths) 78 | try { 79 | if (sheetData.rowIterator.isEmpty) { 80 | /* If the first file is empty, not checking further */ 81 | StructType(Seq.empty) 82 | } else { 83 | /* Ready to infer schema */ 84 | ExcelInferSchema(options).infer(sheetData.rowIterator, colNames) 85 | } 86 | } finally { 87 | sheetData.close() 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/3.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelWriteBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with 
the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.mapreduce.Job 20 | import org.apache.hadoop.mapreduce.TaskAttemptContext 21 | import org.apache.spark.sql.connector.write.LogicalWriteInfo 22 | import org.apache.spark.sql.execution.datasources.OutputWriter 23 | import org.apache.spark.sql.execution.datasources.OutputWriterFactory 24 | import org.apache.spark.sql.execution.datasources.v2.FileWrite 25 | import org.apache.spark.sql.internal.SQLConf 26 | import org.apache.spark.sql.types.DataType 27 | import org.apache.spark.sql.types.StructType 28 | 29 | case class ExcelWriteBuilder( 30 | paths: Seq[String], 31 | formatName: String, 32 | supportsDataType: DataType => Boolean, 33 | info: LogicalWriteInfo 34 | ) extends FileWrite { 35 | override def prepareWrite( 36 | sqlConf: SQLConf, 37 | job: Job, 38 | options: Map[String, String], 39 | dataSchema: StructType 40 | ): OutputWriterFactory = { 41 | 42 | val excelOptions = new ExcelOptions(options, sqlConf.sessionLocalTimeZone) 43 | 44 | new OutputWriterFactory { 45 | override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { 46 | new ExcelOutputWriter(path, dataSchema, context, excelOptions) 47 | } 48 | 49 | override def getFileExtension(context: TaskAttemptContext): String = 50 | s".${excelOptions.fileExtension}" 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/3.3_and_up/scala/dev/mauch/spark/excel/v2/ExcelScan.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
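// ---- Editor's note (not part of ExcelScan.scala): how this copy differs from 3.0_to_3.2. ----
// Grounded in the two variants present in this tree: the 3.3_and_up copy drops the withFilters
// override, and its ExcelScanBuilder relies on SupportsPushDownCatalystFilters with
// pushedDataFilters rather than SupportsPushDownFilters, tracking the newer FileScan/FileScanBuilder
// API. The scan behaviour itself (unsplittable files, corrupt-record checks, broadcast Hadoop conf)
// is unchanged.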
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.catalyst.expressions.{ExprUtils, Expression} 22 | import org.apache.spark.sql.connector.read.PartitionReaderFactory 23 | import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex 24 | import org.apache.spark.sql.execution.datasources.v2.TextBasedFileScan 25 | import org.apache.spark.sql.sources.Filter 26 | import org.apache.spark.sql.types.StructType 27 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 28 | import org.apache.spark.util.SerializableConfiguration 29 | 30 | import scala.collection.compat.immutable.ArraySeq 31 | import scala.jdk.CollectionConverters._ 32 | 33 | case class ExcelScan( 34 | sparkSession: SparkSession, 35 | fileIndex: PartitioningAwareFileIndex, 36 | dataSchema: StructType, 37 | readDataSchema: StructType, 38 | readPartitionSchema: StructType, 39 | options: CaseInsensitiveStringMap, 40 | pushedFilters: Array[Filter], 41 | partitionFilters: Seq[Expression] = Seq.empty, 42 | dataFilters: Seq[Expression] = Seq.empty 43 | ) extends TextBasedFileScan(sparkSession, options) { 44 | 45 | private lazy val parsedOptions: ExcelOptions = new ExcelOptions( 46 | options.asScala.toMap, 47 | sparkSession.sessionState.conf.sessionLocalTimeZone, 48 | sparkSession.sessionState.conf.columnNameOfCorruptRecord 49 | ) 50 | 51 | override def isSplitable(path: Path): Boolean = false 52 | 53 | override def getFileUnSplittableReason(path: Path): String = { 54 | "No practical method of splitting an excel file" 55 | } 56 | 57 | override def createReaderFactory(): PartitionReaderFactory = { 58 | 59 | /* Check a field requirement for corrupt records here to throw an exception in a driver side 60 | */ 61 | ExprUtils.verifyColumnNameOfCorruptRecord(dataSchema, parsedOptions.columnNameOfCorruptRecord) 62 | 63 | if ( 64 | readDataSchema.length == 1 && 65 | readDataSchema.head.name == parsedOptions.columnNameOfCorruptRecord 66 | ) { 67 | throw new RuntimeException( 68 | "Queries from raw Excel files are disallowed when the referenced " + 69 | "columns only include the internal corrupt record column" 70 | ) 71 | } 72 | 73 | val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap 74 | 75 | /* Hadoop Configurations are case sensitive. */ 76 | val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) 77 | 78 | val broadcastedConf = sparkSession.sparkContext 79 | .broadcast(new SerializableConfiguration(hadoopConf)) 80 | 81 | /* The partition values are already truncated in `FileScan.partitions`. We should use `readPartitionSchema` as the 82 | * partition schema here. 
83 | */ 84 | ExcelPartitionReaderFactory( 85 | sparkSession.sessionState.conf, 86 | broadcastedConf, 87 | dataSchema, 88 | readDataSchema, 89 | readPartitionSchema, 90 | parsedOptions, 91 | ArraySeq.unsafeWrapArray(pushedFilters) 92 | ) 93 | } 94 | 95 | override def equals(obj: Any): Boolean = obj match { 96 | case c: ExcelScan => 97 | super.equals(c) && dataSchema == c.dataSchema && options == c.options && 98 | equivalentFilters(pushedFilters, c.pushedFilters) 99 | case _ => false 100 | } 101 | 102 | override def hashCode(): Int = super.hashCode() 103 | 104 | override def description(): String = { 105 | super.description() + ", PushedFilters: " + pushedFilters.mkString("[", ", ", "]") 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/3.3_and_up/scala/dev/mauch/spark/excel/v2/ExcelScanBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.SparkSession 20 | import org.apache.spark.sql.connector.read.Scan 21 | import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex 22 | import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder 23 | import org.apache.spark.sql.internal.connector.SupportsPushDownCatalystFilters 24 | import org.apache.spark.sql.types.StructType 25 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 26 | 27 | case class ExcelScanBuilder( 28 | sparkSession: SparkSession, 29 | fileIndex: PartitioningAwareFileIndex, 30 | schema: StructType, 31 | dataSchema: StructType, 32 | options: CaseInsensitiveStringMap 33 | ) extends FileScanBuilder(sparkSession, fileIndex, dataSchema) 34 | with SupportsPushDownCatalystFilters { 35 | 36 | override def build(): Scan = { 37 | ExcelScan(sparkSession, fileIndex, dataSchema, readDataSchema(), readPartitionSchema(), options, pushedDataFilters) 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/3.4.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelParserBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
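// ---- Editor's note (not part of ExcelParserBase.scala): why this trait exists in two copies. ----
// In the 3.4.2_and_up tree, BadRecordException is invoked with the full array of partial rows
// (() => Array[InternalRow]), whereas the 3.0_to_3.4.1 copy passes a single optional row via
// partialResults.headOption; each source tree carries the call matching its Spark API.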
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import _root_.org.apache.spark.sql.catalyst.util.BadRecordException 20 | import org.apache.spark.unsafe.types.UTF8String 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | 23 | trait ExcelParserBase { 24 | 25 | protected def getCurrentInput: UTF8String 26 | def badRecord(partialResults: Array[InternalRow], baseException: Throwable): BadRecordException = 27 | BadRecordException(() => getCurrentInput, () => partialResults, baseException) 28 | } 29 | -------------------------------------------------------------------------------- /src/main/3.4.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelPartitionReaderFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.conf.Configuration 20 | import org.apache.spark.broadcast.Broadcast 21 | import org.apache.spark.sql.catalyst.{FileSourceOptions, InternalRow} 22 | import org.apache.spark.sql.connector.read.PartitionReader 23 | import org.apache.spark.sql.execution.datasources.PartitionedFile 24 | import org.apache.spark.sql.execution.datasources.v2._ 25 | import org.apache.spark.sql.internal.SQLConf 26 | import org.apache.spark.sql.sources.Filter 27 | import org.apache.spark.sql.types.StructType 28 | import org.apache.spark.util.SerializableConfiguration 29 | 30 | import java.net.URI 31 | import scala.util.control.NonFatal 32 | 33 | /** A factory used to create Excel readers. 34 | * 35 | * @param sqlConf 36 | * SQL configuration. 37 | * @param broadcastedConf 38 | * Broadcasted serializable Hadoop Configuration. 39 | * @param dataSchema 40 | * Schema of Excel files. 41 | * @param readDataSchema 42 | * Required data schema in the batch scan. 43 | * @param partitionSchema 44 | * Schema of partitions. 45 | * @param parsedOptions 46 | * Options for parsing Excel files. 
47 | */ 48 | case class ExcelPartitionReaderFactory( 49 | sqlConf: SQLConf, 50 | broadcastedConf: Broadcast[SerializableConfiguration], 51 | dataSchema: StructType, 52 | readDataSchema: StructType, 53 | partitionSchema: StructType, 54 | parsedOptions: ExcelOptions, 55 | filters: Seq[Filter] 56 | ) extends FilePartitionReaderFactory { 57 | protected def options: FileSourceOptions = new FileSourceOptions( 58 | Map(FileSourceOptions.IGNORE_CORRUPT_FILES -> "true", FileSourceOptions.IGNORE_MISSING_FILES -> "true") 59 | ) 60 | override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = { 61 | val conf = broadcastedConf.value.value 62 | val actualDataSchema = 63 | StructType(dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) 64 | val actualReadDataSchema = 65 | StructType(readDataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) 66 | val parser = new ExcelParser(actualDataSchema, actualReadDataSchema, parsedOptions, filters) 67 | val headerChecker = 68 | new ExcelHeaderChecker(actualReadDataSchema, parsedOptions, source = s"Excel file: ${file.filePath}") 69 | val iter = readFile(conf, file, parser, headerChecker, readDataSchema) 70 | val partitionReader = new SparkExcelPartitionReaderFromIterator(iter) 71 | new PartitionReaderWithPartitionValues(partitionReader, readDataSchema, partitionSchema, file.partitionValues) 72 | } 73 | 74 | private def readFile( 75 | conf: Configuration, 76 | file: PartitionedFile, 77 | parser: ExcelParser, 78 | headerChecker: ExcelHeaderChecker, 79 | requiredSchema: StructType 80 | ): SheetData[InternalRow] = { 81 | val excelHelper = ExcelHelper(parsedOptions) 82 | val sheetData = excelHelper.getSheetData(conf, URI.create(file.filePath.toString)) 83 | try { 84 | SheetData( 85 | ExcelParser.parseIterator(sheetData.rowIterator, parser, headerChecker, requiredSchema), 86 | sheetData.resourcesToClose 87 | ) 88 | } catch { 89 | case NonFatal(t) => { 90 | sheetData.close() 91 | throw t 92 | } 93 | } 94 | } 95 | 96 | } 97 | 98 | private class SparkExcelPartitionReaderFromIterator(sheetData: SheetData[InternalRow]) 99 | extends PartitionReaderFromIterator[InternalRow](sheetData.rowIterator) { 100 | override def close(): Unit = { 101 | super.close() 102 | sheetData.close() 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/3.4_and_up/scala/dev/mauch/spark/excel/v2/ExcelOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.catalyst.FileSourceOptions 20 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 21 | import org.apache.spark.sql.internal.SQLConf 22 | 23 | class ExcelOptions( 24 | @transient 25 | val parameters: CaseInsensitiveMap[String], 26 | val defaultTimeZoneId: String, 27 | val defaultColumnNameOfCorruptRecord: String 28 | ) extends FileSourceOptions(parameters) 29 | with ExcelOptionsTrait { 30 | 31 | def this(parameters: Map[String, String], defaultTimeZoneId: String) = { 32 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, SQLConf.get.columnNameOfCorruptRecord) 33 | } 34 | 35 | def this(parameters: Map[String, String], defaultTimeZoneId: String, defaultColumnNameOfCorruptRecord: String) = { 36 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, defaultColumnNameOfCorruptRecord) 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | dev.mauch.spark.excel.v2.ExcelDataSource 2 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
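// ---- Editor's example (not part of DefaultSource.scala): the original V1 relation in use. ----
// A minimal sketch with assumed paths and options; "header" is mandatory here because
// createRelation calls checkParameter("header").
object ExcelV1ReadExample {
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("excel-v1-read").master("local[*]").getOrCreate()
    val df = spark.read
      .format("dev.mauch.spark.excel") // the package name resolves to this DefaultSource
      .option("header", "true")        // required; see checkParameter below
      .option("inferSchema", "true")   // asks ExcelRelation to infer column types (inferSheetSchema)
      .load("/tmp/input.xlsx")         // hypothetical path
    df.show()
    spark.stop()
  }
}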
15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.sql.sources._ 21 | import org.apache.spark.sql.types.StructType 22 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 23 | 24 | class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { 25 | 26 | /** Creates a new relation for retrieving data from an Excel file 27 | */ 28 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): ExcelRelation = 29 | createRelation(sqlContext, parameters, null) 30 | 31 | /** Creates a new relation for retrieving data from an Excel file 32 | */ 33 | override def createRelation( 34 | sqlContext: SQLContext, 35 | parameters: Map[String, String], 36 | schema: StructType 37 | ): ExcelRelation = { 38 | val conf = sqlContext.sparkSession.sessionState.newHadoopConf() 39 | val wbReader = WorkbookReader(parameters, conf) 40 | val dataLocator = DataLocator(parameters) 41 | ExcelRelation( 42 | header = checkParameter(parameters, "header").toBoolean, 43 | treatEmptyValuesAsNulls = parameters.get("treatEmptyValuesAsNulls").fold(false)(_.toBoolean), 44 | setErrorCellsToFallbackValues = parameters.get("setErrorCellsToFallbackValues").fold(false)(_.toBoolean), 45 | usePlainNumberFormat = parameters.get("usePlainNumberFormat").fold(false)(_.toBoolean), 46 | userSchema = Option(schema), 47 | inferSheetSchema = parameters.get("inferSchema").fold(false)(_.toBoolean), 48 | addColorColumns = parameters.get("addColorColumns").fold(false)(_.toBoolean), 49 | timestampFormat = parameters.get("timestampFormat"), 50 | dateFormat = parameters.get("dateFormat"), 51 | excerptSize = parameters.get("excerptSize").fold(10)(_.toInt), 52 | dataLocator = dataLocator, 53 | workbookReader = wbReader 54 | )(sqlContext) 55 | } 56 | 57 | override def createRelation( 58 | sqlContext: SQLContext, 59 | mode: SaveMode, 60 | parameters: Map[String, String], 61 | data: DataFrame 62 | ): BaseRelation = { 63 | val path = checkParameter(parameters, "path") 64 | val header = checkParameter(parameters, "header").toBoolean 65 | val filesystemPath = new Path(path) 66 | val conf = sqlContext.sparkSession.sessionState.newHadoopConf() 67 | val fs = filesystemPath.getFileSystem(conf) 68 | new ExcelFileSaver( 69 | fs, 70 | filesystemPath, 71 | data, 72 | saveMode = mode, 73 | header = header, 74 | dataLocator = DataLocator(parameters) 75 | ).save() 76 | 77 | createRelation(sqlContext, parameters, data.schema) 78 | } 79 | 80 | // Forces a Parameter to exist, otherwise an exception is thrown. 81 | private def checkParameter(map: Map[String, String], param: String): String = { 82 | if (!map.contains(param)) { 83 | throw new IllegalArgumentException(s"Parameter ${'"'}$param${'"'} is missing in options.") 84 | } else { 85 | map.apply(param) 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/DefaultSource15.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.spark.sql.sources.DataSourceRegister 20 | 21 | class DefaultSource15 extends DefaultSource with DataSourceRegister { 22 | override def shortName(): String = "excel" 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/ExcelFileSaver.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} 20 | import org.apache.poi.xssf.usermodel.XSSFWorkbook 21 | import org.apache.spark.sql.{DataFrame, SaveMode} 22 | import spoiwo.model._ 23 | import spoiwo.natures.xlsx.Model2XlsxConversions._ 24 | 25 | import java.io.BufferedOutputStream 26 | import scala.jdk.CollectionConverters._ 27 | 28 | object ExcelFileSaver { 29 | final val DEFAULT_SHEET_NAME = "Sheet1" 30 | final val DEFAULT_DATE_FORMAT = "yy-m-d h:mm" 31 | final val DEFAULT_TIMESTAMP_FORMAT = "yyyy-mm-dd hh:mm:ss.000" 32 | } 33 | 34 | class ExcelFileSaver( 35 | fs: FileSystem, 36 | location: Path, 37 | dataFrame: DataFrame, 38 | saveMode: SaveMode, 39 | dataLocator: DataLocator, 40 | header: Boolean = true 41 | ) { 42 | def save(): Unit = { 43 | def sheet(workbook: XSSFWorkbook) = { 44 | val headerRow = if (header) Some(dataFrame.schema.fields.map(_.name).toSeq) else None 45 | val dataRows = dataFrame 46 | .toLocalIterator() 47 | .asScala 48 | .map(_.toSeq) 49 | dataLocator.toSheet(headerRow, dataRows, workbook) 50 | } 51 | val fileAlreadyExists = fs.exists(location) 52 | def writeToWorkbook(workbook: XSSFWorkbook): Unit = { 53 | Workbook(sheet(workbook)).writeToExisting(workbook) 54 | autoClose(new BufferedOutputStream(fs.create(location)))(workbook.write) 55 | } 56 | (fileAlreadyExists, saveMode) match { 57 | case (false, _) | (_, SaveMode.Overwrite) => 58 | if (fileAlreadyExists) { 59 | fs.delete(location, true) 60 | } 61 | writeToWorkbook(new XSSFWorkbook()) 62 | case (true, SaveMode.ErrorIfExists) => 63 | sys.error(s"path $location already exists.") 64 | case (true, SaveMode.Ignore) => () 65 | case (true, SaveMode.Append) => 66 | val inputStream: FSDataInputStream = fs.open(location) 67 | val workbook = new XSSFWorkbook(inputStream) 68 | inputStream.close() 69 | writeToWorkbook(workbook) 70 | } 71 | } 72 | 73 | def autoClose[A <: AutoCloseable, B](closeable: 
A)(fun: (A) => B): B = { 74 | try { 75 | fun(closeable) 76 | } finally { 77 | closeable.close() 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/InferSchema.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.spark.rdd.RDD 20 | import org.apache.spark.sql.types._ 21 | 22 | private[excel] object InferSchema { 23 | type CellType = Int 24 | 25 | /** Similar to the JSON schema inference. [[org.apache.spark.sql.execution.datasources.json.InferSchema]] 26 | * 1. Infer type of each row 2. Merge row types to find common type 3. Replace any null types with string type 27 | */ 28 | def apply(rowsRDD: RDD[Seq[DataType]]): Array[DataType] = { 29 | val startType: Array[DataType] = Array.empty 30 | val rootTypes: Array[DataType] = rowsRDD.aggregate(startType)(inferRowType, mergeRowTypes) 31 | 32 | rootTypes.map { 33 | case _: NullType => StringType 34 | case other => other 35 | } 36 | } 37 | 38 | private def inferRowType(rowSoFar: Array[DataType], next: Seq[DataType]): Array[DataType] = { 39 | val maxLength = math.max(rowSoFar.length, next.size) 40 | val defaultDataType: Int => DataType = (_ => NullType) 41 | val filledRowSoFar = Array.tabulate(maxLength)(n => rowSoFar.applyOrElse[Int, DataType](n, defaultDataType)) 42 | val filledNext = Array.tabulate(maxLength)(n => next.applyOrElse[Int, DataType](n, defaultDataType)) 43 | filledRowSoFar.zip(filledNext).map { case (r, n) => inferField(r, n) } 44 | } 45 | 46 | private[excel] def mergeRowTypes(first: Array[DataType], second: Array[DataType]): Array[DataType] = { 47 | first.zipAll(second, NullType, NullType).map { case ((a, b)) => 48 | findTightestCommonType(a, b).getOrElse(NullType) 49 | } 50 | } 51 | 52 | /** Infer type of string field. Given known type Double, and a string "1", there is no point checking if it is an Int, 53 | * as the final type must be Double or higher. 54 | */ 55 | private[excel] def inferField(typeSoFar: DataType, field: DataType): DataType = { 56 | // Defining a function to return the StringType constant is necessary in order to work around 57 | // a Scala compiler issue which leads to runtime incompatibilities with certain Spark versions; 58 | // see issue #128 for more details. 
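    // Illustrative behaviour of inferField (assumed examples, not part of the original source):
    //   inferField(NullType, DoubleType)    => DoubleType   (the first non-null type wins)
    //   inferField(DoubleType, BooleanType) => StringType   (mixed types fall back to string)
    //   inferField(StringType, DoubleType)  => StringType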
59 | def stringType(): DataType = { 60 | StringType 61 | } 62 | 63 | if (field == NullType) { 64 | typeSoFar 65 | } else { 66 | (typeSoFar, field) match { 67 | case (NullType, ct) => ct 68 | case (DoubleType, DoubleType) => DoubleType 69 | case (BooleanType, BooleanType) => BooleanType 70 | case (TimestampType, TimestampType) => TimestampType 71 | case (StringType, _) => stringType() 72 | case (_, _) => stringType() 73 | } 74 | } 75 | } 76 | 77 | /** Copied from internal Spark api [[org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion]] 78 | */ 79 | private val numericPrecedence: IndexedSeq[DataType] = 80 | IndexedSeq[DataType](ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, TimestampType) 81 | 82 | /** Copied from internal Spark api [[org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion]] 83 | */ 84 | val findTightestCommonType: (DataType, DataType) => Option[DataType] = { 85 | case (t1, t2) if t1 == t2 => Some(t1) 86 | case (NullType, t1) => Some(t1) 87 | case (t1, NullType) => Some(t1) 88 | case (StringType, _) => Some(StringType) 89 | case (_, StringType) => Some(StringType) 90 | 91 | // Promote numeric types to the highest of the two and all numeric types to unlimited decimal 92 | case (t1, t2) if Seq(t1, t2).forall(numericPrecedence.contains) => 93 | val index = numericPrecedence.lastIndexWhere(t => t == t1 || t == t2) 94 | Some(numericPrecedence(index)) 95 | 96 | case _ => None 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/PlainNumberFormat.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import java.math.BigDecimal 20 | import java.text.FieldPosition 21 | import java.text.Format 22 | import java.text.ParsePosition 23 | 24 | /** A format that formats a double as a plain string without rounding and scientific notation. All other operations are 25 | * unsupported. 26 | * @see 27 | * [[org.apache.poi.ss.usermodel.ExcelGeneralNumberFormat]] and SSNFormat from 28 | * [[org.apache.poi.ss.usermodel.DataFormatter]] from Apache POI. 
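 * @example
 *   Illustration added for clarity (assumed values, not taken from the original source): for the
 *   double 1.23456789012E11 this format appends "123456789012", and for 0.0000123 it appends
 *   "0.0000123", whereas a General/scientific-notation rendering would show something like
 *   "1.23457E+11" and "1.23E-05".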
29 | */ 30 | object PlainNumberFormat extends Format { 31 | 32 | override def format(number: AnyRef, toAppendTo: StringBuffer, pos: FieldPosition): StringBuffer = { 33 | // Convert to BigDecimal for formatting 34 | val bd = new BigDecimal(number.toString) 35 | // Check if the number is an integer (scale == 0 after stripping trailing zeros) 36 | val stripped = bd.stripTrailingZeros() 37 | if (stripped.scale() <= 0) { 38 | // It's an integer, format without decimal point 39 | toAppendTo.append(stripped.toBigInteger().toString()) 40 | } else { 41 | // It's not an integer, format as plain string 42 | toAppendTo.append(bd.toPlainString) 43 | } 44 | } 45 | 46 | override def parseObject(source: String, pos: ParsePosition): AnyRef = 47 | throw new UnsupportedOperationException() 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/Utils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | import scala.util.{Success, Try} 19 | 20 | object Utils { 21 | implicit class RichTry[T](t: Try[T]) { 22 | def toEither: Either[Throwable, T] = t.transform(s => Success(Right(s)), f => Success(Left(f))).get 23 | } 24 | 25 | case class MapIncluding[K](keys: Seq[K], optionally: Seq[K] = Seq()) { 26 | def unapply[V](m: Map[K, V]): Option[(Seq[V], Seq[Option[V]])] = 27 | if (keys.forall(m.contains)) { 28 | Some((keys.map(m), optionally.map(m.get))) 29 | } else { 30 | None 31 | } 32 | } 33 | sealed trait MapRequirements[K] { 34 | type ResultType[V] 35 | def unapplySeq[V](m: Map[K, V]): Option[ResultType[V]] 36 | } 37 | case class RequiredKeys[K](keys: K*) extends MapRequirements[K] { 38 | type ResultType[V] = Seq[V] 39 | def unapplySeq[V](m: Map[K, V]): Option[Seq[V]] = 40 | if (keys.forall(m.contains)) { 41 | Some(keys.map(m)) 42 | } else { 43 | None 44 | } 45 | } 46 | case class OptionalKeys[K](keys: K*) extends MapRequirements[K] { 47 | type ResultType[V] = Seq[Option[V]] 48 | def unapplySeq[V](m: Map[K, V]): Option[Seq[Option[V]]] = Some(keys.map(m.get)) 49 | } 50 | case class MapWith[K]( 51 | requiredKeys: RequiredKeys[K] = RequiredKeys[K](), 52 | optionalKeys: OptionalKeys[K] = OptionalKeys[K]() 53 | ) { 54 | def unapply[V](m: Map[K, V]): Option[(requiredKeys.ResultType[V], optionalKeys.ResultType[V])] = 55 | for { 56 | req <- requiredKeys.unapplySeq(m) 57 | opt <- optionalKeys.unapplySeq(m) 58 | } yield (req, opt) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/WorkbookReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file 
except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import java.io.InputStream 20 | import dev.mauch.spark.excel.Utils.MapIncluding 21 | import com.github.pjfanning.xlsx.StreamingReader 22 | import org.apache.hadoop.conf.Configuration 23 | import org.apache.hadoop.fs.{FileSystem, Path} 24 | import org.apache.poi.ss.usermodel.{Workbook, WorkbookFactory} 25 | import org.apache.poi.hssf.usermodel.HSSFWorkbookFactory 26 | import org.apache.poi.openxml4j.util.ZipInputStreamZipEntrySource 27 | import org.apache.poi.util.IOUtils 28 | import org.apache.poi.xssf.usermodel.XSSFWorkbookFactory 29 | import scala.collection.JavaConverters.mapAsScalaMapConverter 30 | 31 | trait WorkbookReader { 32 | protected def openWorkbook(): Workbook 33 | def withWorkbook[T](f: Workbook => T): T = { 34 | val workbook = openWorkbook() 35 | val res = f(workbook) 36 | workbook.close() 37 | res 38 | } 39 | def sheetNames: Seq[String] = { 40 | withWorkbook(workbook => 41 | for (sheetIx <- (0 until workbook.getNumberOfSheets())) yield { 42 | workbook.getSheetAt(sheetIx).getSheetName() 43 | } 44 | ) 45 | } 46 | } 47 | 48 | object WorkbookReader { 49 | val WithLocationMaxRowsInMemoryAndPassword = 50 | MapIncluding( 51 | Seq("path"), 52 | optionally = Seq("maxRowsInMemory", "workbookPassword", "maxByteArraySize", "tempFileThreshold") 53 | ) 54 | 55 | WorkbookFactory.addProvider(new HSSFWorkbookFactory) 56 | WorkbookFactory.addProvider(new XSSFWorkbookFactory) 57 | 58 | def apply(parameters: java.util.HashMap[String, String], hadoopConfiguration: Configuration): WorkbookReader = { 59 | apply(parameters.asScala.toMap, hadoopConfiguration) 60 | } 61 | 62 | def apply(parameters: Map[String, String], hadoopConfiguration: Configuration): WorkbookReader = { 63 | def readFromHadoop(location: String) = { 64 | val path = new Path(location) 65 | FileSystem.get(path.toUri, hadoopConfiguration).open(path) 66 | } 67 | parameters match { 68 | case WithLocationMaxRowsInMemoryAndPassword( 69 | Seq(location), 70 | Seq(Some(maxRowsInMemory), passwordOption, maxByteArraySizeOption, tempFileThreshold) 71 | ) => 72 | new StreamingWorkbookReader( 73 | readFromHadoop(location), 74 | passwordOption, 75 | maxRowsInMemory.toInt, 76 | maxByteArraySizeOption.map(_.toInt), 77 | tempFileThreshold.map(_.toInt) 78 | ) 79 | case WithLocationMaxRowsInMemoryAndPassword( 80 | Seq(location), 81 | Seq(None, passwordOption, maxByteArraySizeOption, tempFileThresholdOption) 82 | ) => 83 | new DefaultWorkbookReader( 84 | readFromHadoop(location), 85 | passwordOption, 86 | maxByteArraySizeOption.map(_.toInt), 87 | tempFileThresholdOption.map(_.toInt) 88 | ) 89 | } 90 | } 91 | } 92 | class DefaultWorkbookReader( 93 | inputStreamProvider: => InputStream, 94 | workbookPassword: Option[String], 95 | maxByteArraySize: Option[Int], 96 | tempFileThreshold: Option[Int] 97 | ) extends WorkbookReader { 98 | 99 | protected def openWorkbook(): Workbook = { 100 | maxByteArraySize.foreach { maxSize => 101 | IOUtils.setByteArrayMaxOverride(maxSize) 102 | } 103 | tempFileThreshold.foreach { threshold 
=> 104 | ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(threshold) 105 | } 106 | workbookPassword 107 | .fold(WorkbookFactory.create(inputStreamProvider))(password => 108 | WorkbookFactory.create(inputStreamProvider, password) 109 | ) 110 | } 111 | } 112 | 113 | class StreamingWorkbookReader( 114 | inputStreamProvider: => InputStream, 115 | workbookPassword: Option[String], 116 | maxRowsInMem: Int, 117 | maxByteArraySize: Option[Int], 118 | tempFileThreshold: Option[Int] 119 | ) extends WorkbookReader { 120 | override protected def openWorkbook(): Workbook = { 121 | maxByteArraySize.foreach { maxSize => 122 | IOUtils.setByteArrayMaxOverride(maxSize) 123 | } 124 | tempFileThreshold.foreach { threshold => 125 | ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(threshold) 126 | } 127 | val builder = StreamingReader 128 | .builder() 129 | .rowCacheSize(maxRowsInMem) 130 | .bufferSize(4096) 131 | workbookPassword 132 | .fold(builder)(password => builder.password(password)) 133 | .open(inputStreamProvider) 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark 18 | 19 | import org.apache.poi.ss.usermodel.Row.MissingCellPolicy 20 | import org.apache.poi.ss.usermodel.{Cell, CellType, Row} 21 | import org.apache.spark.sql.{DataFrameReader, DataFrameWriter} 22 | import spoiwo.model.Sheet 23 | 24 | package object excel { 25 | implicit class RichRow(val row: Row) extends AnyVal { 26 | def eachCellIterator(startColumn: Int, endColumn: Int): Iterator[Option[Cell]] = 27 | new Iterator[Option[Cell]] { 28 | private val lastCellInclusive = row.getLastCellNum - 1 29 | private val endCol = Math.min(endColumn, Math.max(startColumn, lastCellInclusive)) 30 | require(startColumn >= 0 && startColumn <= endCol) 31 | 32 | private var nextCol = startColumn 33 | 34 | override def hasNext: Boolean = nextCol <= endCol && nextCol <= lastCellInclusive 35 | 36 | override def next(): Option[Cell] = { 37 | val next = 38 | if (nextCol > endCol) throw new NoSuchElementException(s"column index = $nextCol") 39 | else Option(row.getCell(nextCol, MissingCellPolicy.RETURN_NULL_AND_BLANK)) 40 | nextCol += 1 41 | next 42 | } 43 | } 44 | } 45 | 46 | implicit class RichCell(val cell: Cell) extends AnyVal { 47 | def value: Any = 48 | cell.getCellType match { 49 | case CellType.BLANK | CellType.ERROR | CellType._NONE => null 50 | case CellType.NUMERIC => cell.getNumericCellValue 51 | case CellType.STRING => cell.getStringCellValue 52 | case CellType.BOOLEAN => cell.getBooleanCellValue 53 | case CellType.FORMULA => 54 | cell.getCachedFormulaResultType match { 55 | case CellType.BLANK => null 56 | case CellType.NUMERIC => cell.getNumericCellValue 57 | case CellType.STRING => cell.getRichStringCellValue 58 | case CellType.BOOLEAN => cell.getBooleanCellValue 59 | case _ => null 60 | } 61 | } 62 | } 63 | 64 | implicit class RichSpoiwoSheet(val sheet: Sheet) extends AnyVal { 65 | def extractTableData(tableNumber: Int): Seq[Seq[Any]] = { 66 | val table = sheet.tables(tableNumber) 67 | val (startRow, endRow) = table.cellRange.rowRange 68 | val (startColumn, endColumn) = table.cellRange.columnRange 69 | val tableRows = sheet.rows.filter(r => r.index.exists((startRow to endRow).contains)) 70 | tableRows.map(_.cells.filter(_.index.exists((startColumn to endColumn).contains)).map(_.value).toSeq) 71 | } 72 | } 73 | 74 | implicit class ExcelDataFrameReader(val dataFrameReader: DataFrameReader) extends AnyVal { 75 | def excel( 76 | header: Boolean = true, 77 | treatEmptyValuesAsNulls: Boolean = false, 78 | setErrorCellsToFallbackValues: Boolean = false, 79 | inferSchema: Boolean = false, 80 | usePlainNumberFormat: Boolean = false, 81 | addColorColumns: Boolean = false, 82 | dataAddress: String = null, 83 | timestampFormat: String = null, 84 | maxRowsInMemory: java.lang.Integer = null, 85 | maxByteArraySize: java.lang.Integer = null, 86 | tempFileThreshold: java.lang.Integer = null, 87 | excerptSize: Int = 10, 88 | workbookPassword: String = null 89 | ): DataFrameReader = { 90 | Map( 91 | "header" -> header, 92 | "treatEmptyValuesAsNulls" -> treatEmptyValuesAsNulls, 93 | "setErrorCellsToFallbackValues" -> setErrorCellsToFallbackValues, 94 | "usePlainNumberFormat" -> usePlainNumberFormat, 95 | "inferSchema" -> inferSchema, 96 | "addColorColumns" -> addColorColumns, 97 | "dataAddress" -> dataAddress, 98 | "timestampFormat" -> timestampFormat, 99 | "maxRowsInMemory" -> maxRowsInMemory, 100 | "maxByteArraySize" -> maxByteArraySize, 101 | "tempFileThreshold" -> tempFileThreshold, 102 | "excerptSize" -> excerptSize, 103 | "workbookPassword" -> workbookPassword 104 | 
).foldLeft(dataFrameReader.format("dev.mauch.spark.excel")) { case (dfReader, (key, value)) => 105 | value match { 106 | case null => dfReader 107 | case v => dfReader.option(key, v.toString) 108 | } 109 | } 110 | } 111 | } 112 | 113 | implicit class ExcelDataFrameWriter[T](val dataFrameWriter: DataFrameWriter[T]) extends AnyVal { 114 | def excel( 115 | header: Boolean = true, 116 | dataAddress: String = null, 117 | preHeader: String = null, 118 | dateFormat: String = null, 119 | timestampFormat: String = null, 120 | workbookPassword: String = null 121 | ): DataFrameWriter[T] = { 122 | Map( 123 | "header" -> header, 124 | "dataAddress" -> dataAddress, 125 | "dateFormat" -> dateFormat, 126 | "timestampFormat" -> timestampFormat, 127 | "workbookPassword" -> workbookPassword, 128 | "preHeader" -> preHeader 129 | ).foldLeft(dataFrameWriter.format("dev.mauch.spark.excel")) { case (dfWriter, (key, value)) => 130 | value match { 131 | case null => dfWriter 132 | case v => dfWriter.option(key, v.toString) 133 | } 134 | } 135 | } 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/v2/ExcelHeaderChecker.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.internal.Logging 20 | import org.apache.spark.sql.internal.SQLConf 21 | import org.apache.spark.sql.types.StructType 22 | 23 | /** Checks that column names in a Excel header and field names in the schema are the same by taking into account case 24 | * sensitivity. 25 | * 26 | * @param schema 27 | * provided (or inferred) schema to which Excel must conform. 28 | * @param options 29 | * parsed Excel options. 30 | * @param source 31 | * name of Excel source that are currently checked. It is used in error messages. 32 | */ 33 | class ExcelHeaderChecker(schema: StructType, options: ExcelOptions, source: String) extends Logging { 34 | 35 | /** Indicates if it is set to `false`, comparison of column names and schema field names is not case sensitive. 36 | */ 37 | private val caseSensitive = SQLConf.get.caseSensitiveAnalysis 38 | 39 | /** Indicates if it is `true`, column names are ignored otherwise the Excel column names are checked for conformance 40 | * to the schema. In the case if the column name don't conform to the schema, an exception is thrown. 41 | */ 42 | private val enforceSchema = options.enforceSchema 43 | 44 | /** Checks that column names in a Excel header and field names in the schema are the same by taking into account case 45 | * sensitivity. 46 | * 47 | * @param columnNames 48 | * names of Excel columns that must be checked against to the schema. 
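   * @example
   *   A sketch of the intended behaviour (assumed values, not from the original source): with schema
   *   fields ("id", "name") and header columns ("ID", "Name") the check passes while
   *   spark.sql.caseSensitive is false; with header columns ("id", "surname") a mismatch message is
   *   built and either logged as a warning (enforceSchema = true) or thrown as an
   *   IllegalArgumentException (enforceSchema = false).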
49 | */ 50 | def checkHeaderColumnNames(columnNames: Vector[String]): Unit = { 51 | if (columnNames != null) { 52 | val fieldNames = schema.map(_.name).toIndexedSeq 53 | val (headerLen, schemaSize) = (columnNames.size, fieldNames.length) 54 | var errorMessage: Option[String] = None 55 | 56 | if (headerLen == schemaSize) { 57 | var i = 0 58 | while (errorMessage.isEmpty && i < headerLen) { 59 | var (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i)) 60 | if (!caseSensitive) { 61 | // scalastyle:off caselocale 62 | nameInSchema = nameInSchema.toLowerCase 63 | nameInHeader = nameInHeader.toLowerCase 64 | // scalastyle:on caselocale 65 | } 66 | if (nameInHeader != nameInSchema) { 67 | errorMessage = Some(s"""|Excel header does not conform to the schema. 68 | | Header: ${columnNames.mkString(", ")} 69 | | Schema: ${fieldNames.mkString(", ")} 70 | |Expected: ${fieldNames(i)} but found: ${columnNames(i)} 71 | |$source""".stripMargin) 72 | } 73 | i += 1 74 | } 75 | } else { 76 | errorMessage = Some(s"""|Number of column in Excel header is not equal to number of fields in the schema: 77 | | Header length: $headerLen, schema size: $schemaSize 78 | |$source""".stripMargin) 79 | } 80 | 81 | errorMessage.foreach { msg => 82 | if (enforceSchema) { logWarning(msg) } 83 | else { throw new IllegalArgumentException(msg) } 84 | } 85 | } 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/v2/ExcelOptionsTrait.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.catalyst.util.{ 20 | CaseInsensitiveMap, 21 | DateFormatter, 22 | DateTimeUtils, 23 | ParseMode, 24 | PermissiveMode, 25 | TimestampFormatter 26 | } 27 | 28 | import java.time.ZoneId 29 | import java.util.Locale 30 | import scala.annotation.nowarn 31 | 32 | trait ExcelOptionsTrait extends Serializable { 33 | 34 | val parameters: CaseInsensitiveMap[String] 35 | val defaultTimeZoneId: String 36 | val defaultColumnNameOfCorruptRecord: String 37 | 38 | private def getInt(paramName: String): Option[Int] = { 39 | val paramValue = parameters.get(paramName) 40 | paramValue match { 41 | case None => None 42 | case Some(null) => None 43 | case Some(value) => 44 | try { 45 | Some(value.toInt) 46 | } catch { 47 | case _: NumberFormatException => 48 | throw new RuntimeException(s"$paramName should be an integer. 
Found $value") 49 | } 50 | } 51 | } 52 | 53 | private def getBool(paramName: String, default: Boolean): Boolean = { 54 | val param = parameters.getOrElse(paramName, default.toString) 55 | if (param == null) { 56 | default 57 | } else if (param.toLowerCase(Locale.ROOT) == "true") { 58 | true 59 | } else if (param.toLowerCase(Locale.ROOT) == "false") { 60 | false 61 | } else { 62 | throw new Exception(s"$paramName flag can be true or false") 63 | } 64 | } 65 | 66 | /* Parsing mode, how to handle corrupted record. Default to permissive */ 67 | val parseMode: ParseMode = parameters 68 | .get("mode") 69 | .map(ParseMode.fromString) 70 | .getOrElse(PermissiveMode) 71 | 72 | val zoneId: ZoneId = ZoneId 73 | .of(parameters.getOrElse(DateTimeUtils.TIMEZONE_OPTION, defaultTimeZoneId)) 74 | 75 | /* A language tag in IETF BCP 47 format */ 76 | val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US) 77 | 78 | val dateFormat: String = parameters.getOrElse("dateFormat", DateFormatter.defaultPattern) 79 | 80 | @nowarn 81 | val timestampFormat: String = parameters.getOrElse("timestampFormat", TimestampFormatter.defaultPattern) 82 | 83 | /* Have header line when reading and writing */ 84 | val header = getBool("header", default = true) 85 | 86 | /* Number of rows to ignore after header. Only in reading */ 87 | val ignoreAfterHeader = getInt("ignoreAfterHeader").getOrElse(0) 88 | 89 | val inferSchema = getBool("inferSchema", default = false) 90 | val excerptSize = getInt("excerptSize") 91 | 92 | /** Forcibly apply the specified or inferred schema to data files. If the option is enabled, headers of ABC files will 93 | * be ignored. 94 | */ 95 | val enforceSchema = getBool("enforceSchema", default = true) 96 | 97 | /* Name for column of corrupted records */ 98 | val columnNameOfCorruptRecord = parameters 99 | .getOrElse("columnNameOfCorruptRecord", defaultColumnNameOfCorruptRecord) 100 | 101 | val nullValue = parameters.getOrElse("nullValue", "") 102 | val nanValue = parameters.getOrElse("nanValue", "NaN") 103 | val positiveInf = parameters.getOrElse("positiveInf", "Inf") 104 | val negativeInf = parameters.getOrElse("negativeInf", "-Inf") 105 | 106 | /* If true, format the cells without rounding and scientific notations */ 107 | val usePlainNumberFormat = getBool("usePlainNumberFormat", default = false) 108 | 109 | /* If true, keep undefined (Excel) rows */ 110 | val keepUndefinedRows = getBool("keepUndefinedRows", default = false) 111 | 112 | /* Use null value for error cells */ 113 | val useNullForErrorCells = getBool("useNullForErrorCells", default = false) 114 | 115 | /* Additional column for color */ 116 | val addColorColumns = getBool("addColorColumns", default = false) 117 | val ignoreLeadingWhiteSpace = getBool("ignoreLeadingWhiteSpace", default = false) 118 | val ignoreTrailingWhiteSpace = getBool("ignoreTrailingWhiteSpace", default = false) 119 | 120 | /* Additional column for excel row number */ 121 | val columnNameOfRowNumber = parameters.get("columnNameOfRowNumber") 122 | 123 | /* Data address, default to everything */ 124 | val dataAddress = parameters.getOrElse("dataAddress", "A1") 125 | 126 | /* Workbook password, optional */ 127 | val workbookPassword = parameters.get("workbookPassword") 128 | 129 | /* Output excel file extension, default to xlsx */ 130 | val fileExtension = parameters.get("fileExtension") match { 131 | case Some(value) => value.trim 132 | case None => "xlsx" 133 | } 134 | 135 | /* Defines fraction of file used for schema inferring. 
For default and 136 | invalid values, 1.0 will be used */ 137 | val samplingRatio = { 138 | val r = parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0) 139 | if (r > 1.0 || r <= 0.0) 1.0 else r 140 | } 141 | 142 | /** Optional parameter for using a streaming reader which can help with big files (will fail if used with xls format 143 | * files) 144 | */ 145 | val maxRowsInMemory = getInt("maxRowsInMemory") 146 | 147 | // scalastyle:off 148 | /** Optional parameter for <a 149 | * href="https://poi.apache.org/apidocs/5.0/org/apache/poi/util/IOUtils.html#setByteArrayMaxOverride-int-">maxByteArraySize</a> 150 | */ 151 | val maxByteArraySize = getInt("maxByteArraySize") 152 | 153 | // scalastyle:on 154 | /** Optional parameter for specifying the number of bytes at which a zip entry is regarded as too large for holding in 155 | * memory and the data is put in a temp file instead - useful for sheets with a lot of data 156 | */ 157 | val tempFileThreshold = getInt("tempFileThreshold") 158 | 159 | // scalastyle:on 160 | /** Optional parameter to specify whether the sheet name in dataAddress is a regex (for loading multiple sheets at 161 | * once) or the true sheet name 162 | */ 163 | val sheetNameIsRegex = getBool("sheetNameIsRegex", false) 164 | } 165 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/v2/SheetData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import java.io.Closeable 20 | 21 | case class SheetData[T](rowIterator: Iterator[T], resourcesToClose: Seq[Closeable] = Seq.empty) extends Closeable { 22 | def modifyIterator(f: Iterator[T] => Iterator[T]): SheetData[T] = SheetData(f(rowIterator), resourcesToClose) 23 | def append(other: SheetData[T]): SheetData[T] = 24 | SheetData(rowIterator ++ other.rowIterator, resourcesToClose ++ other.resourcesToClose) 25 | override def close(): Unit = resourcesToClose.foreach(_.close()) 26 | } 27 | -------------------------------------------------------------------------------- /src/test/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | # config for log4j 1.x (spark < 3.3) 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.out 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # config for log4j 2.x (spark >= 3.3) 9 | # Extra logging related to initialization of Log4j 10 | # Set to debug or trace if log4j initialization is failing 11 | status = warn 12 | 13 | 14 | # Console appender configuration 15 | appender.console.type = Console 16 | appender.console.name = consoleLogger 17 | appender.console.layout.type = PatternLayout 18 | appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 19 | 20 | # Root logger level 21 | rootLogger.level = warn 22 | # Root logger referring to console appender 23 | rootLogger.appenderRef.stdout.ref = consoleLogger 24 | -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/Issue_747_plain_number.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/Issue_747_plain_number.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/apache_poi/57231_MixedGasReport.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/apache_poi/57231_MixedGasReport.xls -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/apache_poi/DataTableCities.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/apache_poi/DataTableCities.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=1/ca_03.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=1/ca_03.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_04.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_04.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_05.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_05.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_06.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_06.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_07.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_07.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_08.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_08.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_09.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_09.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_10.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_10.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_11.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_11.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_12.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_12.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/infer_stricter_numerical_types.xls: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/infer_stricter_numerical_types.xls -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/infer_stricter_numerical_types.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/infer_stricter_numerical_types.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_162_nihar_gharat.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_162_nihar_gharat.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_285_bryce21.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_285_bryce21.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_463_cristichircu.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_463_cristichircu.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_942_sheetname_digits.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_942_sheetname_digits.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_944_faulty_dimension.md: -------------------------------------------------------------------------------- 1 | The issue_944_faulty_dimension.xlsx file contains `<dimension>` tags on each sheet, that does not conform to the true / physical size of the sheets (e.g. `<dimension ref="A1"/>` instead of `<dimension ref="A1:E2"/>` for sheet1). 2 | 3 | It was fabricated by hand and is used to test the library's ability to handle such cases. 
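
As a rough usage sketch (the reader options and the local resource path below are illustrative assumptions, not taken from the test suite), such a file would typically be loaded like any other workbook, so that reads are not confused by the misleading `<dimension>` tag:

```scala
import org.apache.spark.sql.SparkSession

// Hedged example: "excel" is the registered short name of this data source;
// the header option and the path are assumptions for illustration only.
val spark = SparkSession.builder().master("local[*]").appName("faulty-dimension-demo").getOrCreate()
val df = spark.read
  .format("excel")
  .option("header", "true")
  .load("src/test/resources/spreadsheets/issue_944_faulty_dimension.xlsx")
df.show()
```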
4 | 5 | This is how the file was created: 6 | * take a valid excel file 7 | * rename extension from xlsx to zip 8 | * unzip it 9 | * patch the `<dimension>` tags in `xl/worksheets/sheet1.xml` and `xl/worksheets/sheet2.xml` 10 | * zip it back 11 | * rename extension back to xlsx 12 | -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_944_faulty_dimension.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_944_faulty_dimension.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_965_blank_rows.md: -------------------------------------------------------------------------------- 1 | The issue_965_blank_rows.xlsx is used to test that rows containing no values are discarded if read with keepUndefinedRows == False. 2 | 3 | The Excel was fabricated by hand and is used to test the library's ability to handle such cases. 4 | 5 | This is how the file was created: 6 | * take a valid excel file 7 | * rename extension from xlsx to zip 8 | * unzip it 9 | * add empty row definitions to `xl/worksheets/sheet1.xml` (see) below) 10 | * zip it back 11 | * rename extension back to xlsx 12 | 13 | 14 | The empty row definitions added to the file are as follows: 15 | ```xml 16 | <row r="5" spans="1:7" x14ac:dyDescent="0.25"> 17 | <c r="A5" s="1"/> 18 | <c r="B5" s="1"/> 19 | </row> 20 | ```` -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_965_blank_rows.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_965_blank_rows.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/plain_number.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/plain_number.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/read_multiple_sheets_at_once.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/read_multiple_sheets_at_once.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/read_multiple_sheets_at_once_noheader.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/read_multiple_sheets_at_once_noheader.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/simple_encrypted.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/simple_encrypted.xls 
-------------------------------------------------------------------------------- /src/test/resources/spreadsheets/simple_encrypted.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/simple_encrypted.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/with_errors_all_types.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/with_errors_all_types.xlsx -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/DataFrameSuiteBase.scala: -------------------------------------------------------------------------------- 1 | package dev.mauch.spark 2 | 3 | import com.github.mrpowers.spark.fast.tests.DataFrameComparer 4 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 5 | 6 | import java.sql.Timestamp 7 | 8 | trait DataFrameSuiteBase extends DataFrameComparer { 9 | 10 | lazy val spark: SparkSession = SparkSession 11 | .builder() 12 | .master("local") 13 | .appName("spark-excel session") 14 | .config("spark.sql.shuffle.partitions", "1") 15 | .getOrCreate() 16 | 17 | def assertDataFrameEquals(df1: DataFrame, df2: DataFrame): Unit = 18 | assertSmallDataFrameEquality(df1, df2) 19 | 20 | def assertDataFrameApproximateEquals(expectedDF: DataFrame, actualDF: DataFrame, relTol: Double): Unit = { 21 | val e = (r1: Row, r2: Row) => { 22 | r1.equals(r2) || RelTolComparer.areRowsEqual(r1, r2, relTol) 23 | } 24 | assertLargeDatasetEquality[Row]( 25 | actualDF, 26 | expectedDF, 27 | equals = e, 28 | ignoreNullable = false, 29 | ignoreColumnNames = false, 30 | orderedComparison = false 31 | ) 32 | } 33 | 34 | def assertDataFrameNoOrderEquals(df1: DataFrame, df2: DataFrame): Unit = 35 | assertSmallDataFrameEquality(df1, df2, orderedComparison = false) 36 | } 37 | 38 | object RelTolComparer { 39 | 40 | trait ToNumeric[T] { 41 | def toNumeric(x: Double): T 42 | } 43 | object ToNumeric { 44 | implicit val doubleToDouble: ToNumeric[Double] = new ToNumeric[Double] { 45 | def toNumeric(x: Double): Double = x 46 | } 47 | implicit val doubleToFloat: ToNumeric[Float] = new ToNumeric[Float] { 48 | def toNumeric(x: Double): Float = x.toFloat 49 | } 50 | implicit val doubleToLong: ToNumeric[Long] = new ToNumeric[Long] { 51 | def toNumeric(x: Double): Long = x.toLong 52 | } 53 | implicit val doubleToBigDecimal: ToNumeric[BigDecimal] = new ToNumeric[BigDecimal] { 54 | def toNumeric(x: Double): BigDecimal = BigDecimal(x) 55 | } 56 | } 57 | 58 | /** Approximate equality, based on equals from [[Row]] */ 59 | def areRowsEqual(r1: Row, r2: Row, relTol: Double): Boolean = { 60 | def withinRelTol[T : Numeric : ToNumeric](a: T, b: T): Boolean = { 61 | val num = implicitly[Numeric[T]] 62 | val toNum = implicitly[ToNumeric[T]] 63 | val absTol = num.times(toNum.toNumeric(relTol), num.max(num.abs(a), num.abs(b))) 64 | val diff = num.abs(num.minus(a, b)) 65 | num.lteq(diff, absTol) 66 | } 67 | 68 | if (r1.length != r2.length) { 69 | return false 70 | } else { 71 | (0 until r1.length).foreach(idx => { 72 | if (r1.isNullAt(idx) != r2.isNullAt(idx)) { 73 | return false 74 | } 75 | 76 | if (!r1.isNullAt(idx)) { 77 | val o1 = r1.get(idx) 78 | val o2 = r2.get(idx) 79 | o1 match { 80 | case 
b1: Array[Byte] => 81 | if (!java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) { 82 | return false 83 | } 84 | 85 | case f1: Float => 86 | if ( 87 | java.lang.Float.isNaN(f1) != 88 | java.lang.Float.isNaN(o2.asInstanceOf[Float]) 89 | ) { 90 | return false 91 | } 92 | if (!withinRelTol[Float](f1, o2.asInstanceOf[Float])) { 93 | return false 94 | } 95 | 96 | case d1: Double => 97 | if ( 98 | java.lang.Double.isNaN(d1) != 99 | java.lang.Double.isNaN(o2.asInstanceOf[Double]) 100 | ) { 101 | return false 102 | } 103 | if (!withinRelTol[Double](d1, o2.asInstanceOf[Double])) { 104 | return false 105 | } 106 | 107 | case d1: java.math.BigDecimal => 108 | if (!withinRelTol(BigDecimal(d1), BigDecimal(o2.asInstanceOf[java.math.BigDecimal]))) { 109 | return false 110 | } 111 | 112 | case t1: Timestamp => 113 | if (!withinRelTol(t1.getTime, o2.asInstanceOf[Timestamp].getTime)) { 114 | return false 115 | } 116 | 117 | case _ => 118 | if (o1 != o2) return false 119 | } 120 | } 121 | }) 122 | } 123 | true 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/DataLocatorSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.poi.xssf.usermodel.XSSFWorkbook 20 | import org.scalacheck.Gen 21 | import org.scalatest.funspec.AnyFunSpec 22 | import org.scalatest.matchers.should.Matchers 23 | import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks 24 | import spoiwo.model.Workbook 25 | import spoiwo.natures.xlsx.Model2XlsxConversions._ 26 | 27 | import scala.jdk.CollectionConverters._ 28 | import scala.collection.compat._ 29 | 30 | class DataLocatorSuite extends AnyFunSpec with ScalaCheckPropertyChecks with Matchers with Generators { 31 | describe("with a table reference") { 32 | val dl = DataLocator(Map("dataAddress" -> s"$tableName[#All]")) 33 | describe("containing #All") { 34 | it("extracts the entire table data") { 35 | forAll(sheetWithTableGen) { sheet => 36 | val actualData = dl.readFrom(sheet.convertAsXlsx()).map(_.map(_.value)).to(Seq) 37 | actualData should contain theSameElementsAs sheet.extractTableData(0) 38 | } 39 | } 40 | 41 | it("writes into a new table in a new sheet if no corresponding table exists") { 42 | forAll(sheetGenerator(withHeader = Gen.const(true), numCols = Gen.choose(1, 200))) { dataSheet => 43 | val workbook = new XSSFWorkbook() 44 | val header = dataSheet.rows.head.cells.map(_.value.toString).toSeq 45 | val generatedSheet = dl.toSheet( 46 | header = Some(header), 47 | data = dataSheet.rows.tail.iterator.map(_.cells.map(_.value.toString).toSeq), 48 | existingWorkbook = workbook 49 | ) 50 | generatedSheet.convertAsXlsx(workbook) 51 | val pTable = workbook.getTable(tableName) 52 | pTable.getSheetName should equal(tableName) 53 | pTable.getColumns.asScala.map(_.getName) should contain theSameElementsInOrderAs header 54 | val actualData = dl.readFrom(workbook).map(_.map(_.value)).to(Seq) 55 | actualData should contain theSameElementsAs dataSheet.rows.map(_.cells.map(_.value)) 56 | } 57 | } 58 | 59 | it("overwrites an existing table") { 60 | forAll(sheetWithTableGen) { sheetWithTable => 61 | val workbook = sheetWithTable.convertAsXlsx() 62 | val table = sheetWithTable.tables.head 63 | val header = table.columns.map(_.name) 64 | val tableData = dl.readFrom(workbook).map(_.map(c => s"new_$c")).toList 65 | val generatedSheet = 66 | dl.toSheet(header = tableData.headOption, data = tableData.iterator.drop(1), existingWorkbook = workbook) 67 | Workbook(generatedSheet).writeToExisting(workbook) 68 | val pTable = workbook.getTable(tableName) 69 | pTable.getSheetName should equal(sheetName) 70 | pTable.getColumns.asScala.map(_.getName) should contain theSameElementsInOrderAs header 71 | val actualData = dl.readFrom(workbook).map(_.map(_.value)).to(Seq) 72 | actualData should contain theSameElementsAs tableData 73 | } 74 | } 75 | } 76 | } 77 | describe("without any dataAddress") { 78 | it("defaults to starting at cell A1 in the first sheet") { 79 | val dl = DataLocator(Map()) 80 | dl shouldBe a[CellRangeAddressDataLocator] 81 | val cradl = dl.asInstanceOf[CellRangeAddressDataLocator] 82 | cradl.dataAddress.getFirstCell.formatAsString() should equal("A1") 83 | cradl.dataAddress.getFirstCell.getSheetName should equal(null) 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/EncryptedReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use 
this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.spark.sql._ 20 | import org.apache.spark.sql.types._ 21 | 22 | import dev.mauch.spark.DataFrameSuiteBase 23 | import org.scalatest.funspec.AnyFunSpec 24 | import org.scalatest.matchers.should.Matchers 25 | import scala.jdk.CollectionConverters._ 26 | 27 | object EncryptedReadSuite { 28 | val simpleSchema = StructType( 29 | List( 30 | StructField("A", DoubleType, true), 31 | StructField("B", DoubleType, true), 32 | StructField("C", DoubleType, true), 33 | StructField("D", DoubleType, true) 34 | ) 35 | ) 36 | 37 | val expectedData = List(Row(1.0d, 2.0d, 3.0d, 4.0d)).asJava 38 | } 39 | 40 | class EncryptedReadSuite extends AnyFunSpec with DataFrameSuiteBase with Matchers { 41 | import EncryptedReadSuite._ 42 | 43 | lazy val expected = spark.createDataFrame(expectedData, simpleSchema) 44 | 45 | def readFromResources(path: String, password: String, maxRowsInMemory: Option[Int] = None): DataFrame = { 46 | val url = getClass.getResource(path) 47 | val reader = spark.read 48 | .excel( 49 | dataAddress = s"Sheet1!A1", 50 | treatEmptyValuesAsNulls = true, 51 | workbookPassword = password, 52 | inferSchema = true 53 | ) 54 | val withMaxRows = maxRowsInMemory.fold(reader)(rows => reader.option("maxRowsInMemory", s"$rows")) 55 | withMaxRows.load(url.getPath) 56 | } 57 | 58 | describe("spark-excel") { 59 | it("should read encrypted xslx file") { 60 | val df = readFromResources("/spreadsheets/simple_encrypted.xlsx", "fooba") 61 | 62 | assertDataFrameEquals(expected, df) 63 | } 64 | 65 | it("should read encrypted xlsx file with maxRowsInMem=10") { 66 | val df = readFromResources("/spreadsheets/simple_encrypted.xlsx", "fooba", maxRowsInMemory = Some(10)) 67 | 68 | assertDataFrameEquals(expected, df) 69 | } 70 | 71 | it("should read encrypted xlsx file with maxRowsInMem=1") { 72 | val df = readFromResources("/spreadsheets/simple_encrypted.xlsx", "fooba", maxRowsInMemory = Some(1)) 73 | 74 | assertDataFrameEquals(expected, df) 75 | } 76 | 77 | it("should read encrypted xls file") { 78 | val df = readFromResources("/spreadsheets/simple_encrypted.xls", "fooba") 79 | 80 | assertDataFrameEquals(expected, df) 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/ErrorsAsStringsReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.types._ 21 | import org.apache.spark.sql.{Row, _} 22 | import org.scalatest.funspec.AnyFunSpec 23 | import org.scalatest.matchers.should.Matchers 24 | 25 | import java.sql.Timestamp 26 | import java.time.LocalDateTime 27 | import java.util 28 | import scala.jdk.CollectionConverters._ 29 | 30 | object ErrorsAsStringsReadSuite { 31 | private val dummyTimestamp = Timestamp.valueOf(LocalDateTime.of(2021, 2, 19, 0, 0)) 32 | private val epochTimestamp = new Timestamp(0) 33 | private val dummyText = "hello" 34 | 35 | private val expectedSchemaInfer = StructType( 36 | List( 37 | StructField("double", DoubleType, true), 38 | StructField("boolean", BooleanType, true), 39 | StructField("timestamp", TimestampType, true), 40 | StructField("string", StringType, true), 41 | StructField("formula", StringType, true) 42 | ) 43 | ) 44 | private val expectedDataErrorsAsStringsInfer: util.List[Row] = 45 | List( 46 | Row(1.0, true, dummyTimestamp, dummyText, "A1"), 47 | Row(2.0, false, dummyTimestamp, dummyText, "A3"), 48 | Row(0.0, false, epochTimestamp, "", ""), 49 | Row(0.0, false, epochTimestamp, "", "") 50 | ).asJava 51 | 52 | private val expectedDataErrorsAsNullInfer: util.List[Row] = 53 | List( 54 | Row(1.0, true, dummyTimestamp, dummyText, "A1"), 55 | Row(2.0, false, dummyTimestamp, dummyText, "A3"), 56 | Row(null, null, null, null, null), 57 | Row(null, null, null, null, null) 58 | ).asJava 59 | 60 | private val expectedSchemaNonInfer = StructType( 61 | List( 62 | StructField("double", StringType, true), 63 | StructField("boolean", StringType, true), 64 | StructField("timestamp", StringType, true), 65 | StructField("string", StringType, true), 66 | StructField("formula", StringType, true) 67 | ) 68 | ) 69 | private val expectedDataErrorsAsStringsNonInfer: util.List[Row] = 70 | List( 71 | Row("1", "TRUE", "19\"-\"Feb\"-\"2021", dummyText, "A1"), 72 | Row("2", "FALSE", "19\"-\"Feb\"-\"2021", dummyText, "A3"), 73 | Row("", "", "", "", ""), 74 | Row("", "", "", "", "") 75 | ).asJava 76 | 77 | private val expectedDataErrorsAsNullNonInfer: util.List[Row] = 78 | List( 79 | Row("1", "TRUE", "19\"-\"Feb\"-\"2021", "hello", "A1"), 80 | Row("2", "FALSE", "19\"-\"Feb\"-\"2021", "hello", "A3"), 81 | Row(null, null, null, null, null), 82 | Row(null, null, null, null, null) 83 | ).asJava 84 | 85 | private val excelLocation = "/spreadsheets/with_errors_all_types.xlsx" 86 | } 87 | 88 | class ErrorsAsStringsReadSuite extends AnyFunSpec with DataFrameSuiteBase with Matchers { 89 | import ErrorsAsStringsReadSuite._ 90 | 91 | def readFromResources(path: String, setErrorCellsToFallbackValues: Boolean, inferSchema: Boolean): DataFrame = { 92 | val url = getClass.getResource(path) 93 | spark.read 94 | .excel(setErrorCellsToFallbackValues = setErrorCellsToFallbackValues, inferSchema = inferSchema, excerptSize = 3) 95 | .load(url.getPath) 96 | } 97 | 98 | describe("spark-excel") { 99 | it("should read errors in string format when setErrorCellsToFallbackValues=true and inferSchema=true") { 100 | val df = readFromResources(excelLocation, true, true) 101 | val expected = spark.createDataFrame(expectedDataErrorsAsStringsInfer, expectedSchemaInfer) 102 | assertDataFrameEquals(expected, df) 103 | } 104 | 105 | it("should read errors as null when setErrorCellsToFallbackValues=false and 
inferSchema=true") { 106 | val df = readFromResources(excelLocation, false, true) 107 | val expected = spark.createDataFrame(expectedDataErrorsAsNullInfer, expectedSchemaInfer) 108 | assertDataFrameEquals(expected, df) 109 | } 110 | 111 | it("should read errors in string format when setErrorCellsToFallbackValues=true and inferSchema=false") { 112 | val df = readFromResources(excelLocation, true, false) 113 | val expected = spark.createDataFrame(expectedDataErrorsAsStringsNonInfer, expectedSchemaNonInfer) 114 | assertDataFrameEquals(expected, df) 115 | } 116 | 117 | it("should read errors in string format when setErrorCellsToFallbackValues=false and inferSchema=false") { 118 | val df = readFromResources(excelLocation, false, false) 119 | val expected = spark.createDataFrame(expectedDataErrorsAsNullNonInfer, expectedSchemaNonInfer) 120 | assertDataFrameEquals(expected, df) 121 | } 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/RichRowSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.poi.ss.usermodel.{Cell, Row} 20 | import org.scalacheck.Gen 21 | import org.scalacheck.Prop.propBoolean 22 | import org.scalamock.scalatest.MockFactory 23 | 24 | import scala.util.Try 25 | import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks 26 | import org.scalatest.funsuite.AnyFunSuite 27 | 28 | trait RowGenerator extends MockFactory { 29 | private val MAX_WIDTH = 100 30 | 31 | protected case class GeneratedRow(start: Int, end: Int, lastCellNum: Int, row: Row) 32 | 33 | protected val rowGen: Gen[GeneratedRow] = for { 34 | startColumn <- Gen.choose(0, MAX_WIDTH - 1) 35 | endColumn <- Gen.choose(0, MAX_WIDTH - 1) 36 | lastCellNum <- Gen.choose(0, MAX_WIDTH - 1) 37 | row = stub[Row] 38 | _ = (row.getCell(_: Int)).when(*) returns stub[Cell] 39 | _ = (row.getLastCellNum _).when() returns lastCellNum.toShort 40 | } yield GeneratedRow(startColumn, endColumn, lastCellNum, row) 41 | } 42 | 43 | class RichRowSuite extends AnyFunSuite with ScalaCheckPropertyChecks with RowGenerator { 44 | test("Invalid cell range should throw an error") { 45 | forAll(rowGen) { g => 46 | (g.start > g.end) ==> Try { 47 | g.row.eachCellIterator(g.start, g.end).next() 48 | }.isFailure 49 | } 50 | } 51 | 52 | test("Valid cell range should iterate through all non-empty cells") { 53 | forAll(rowGen) { g => 54 | (g.start <= g.end && g.start < g.lastCellNum) ==> { 55 | val count = g.row.eachCellIterator(g.start, g.end).size 56 | count === Math.min(g.end, g.lastCellNum - 1) - g.start + 1 57 | } 58 | } 59 | } 60 | 61 | test("Valid cell range should should not iterate through non-empty cells") { 62 | forAll(rowGen) { g => 63 | (g.start <= g.end && g.start >= g.lastCellNum) ==> { 64 | g.row.eachCellIterator(g.start, g.end).size === 0 65 | } 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/AreaReferenceReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.util 25 | import scala.jdk.CollectionConverters._ 26 | 27 | /** Loading data from a different data address (AreaReference) 28 | */ 29 | object AreaReferenceReadSuite { 30 | val expectedSchema_01 = StructType( 31 | List( 32 | StructField("Translations!$A$370", StringType, true), 33 | StructField("Translations!$A$371", LongType, true), 34 | StructField("Translations!$A$402", DoubleType, true), 35 | StructField("Translations!$A$393", DoubleType, true), 36 | StructField("Translations!$A$384", DoubleType, true), 37 | StructField("Translations!$A$405", DoubleType, true), 38 | StructField("Translations!$A$396", DoubleType, true), 39 | StructField("Translations!$A$387", DoubleType, true), 40 | StructField("Translations!$A$418", DoubleType, true), 41 | StructField("Translations!$A$419", DoubleType, true), 42 | StructField("Translations!$A$4110", DoubleType, true) 43 | ) 44 | ) 45 | 46 | /* Manually checking 1 row only */ 47 | val expectedData_01: util.List[Row] = List( 48 | Row( 49 | "Alabama", 50 | 140895441L, 51 | 458d, 52 | 122d, 53 | 85116d, 54 | 1009700176.36684d, 55 | 268959435.626102d, 56 | 187645502645.503d, 57 | 0.0072d, 58 | 0.0019d, 59 | 1.3318d 60 | ) 61 | ).asJava 62 | 63 | } 64 | 65 | class AreaReferenceReadSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 66 | import AreaReferenceReadSuite._ 67 | 68 | test("AreaReference from a different sheet with testing data from Apache POI upstream tests") { 69 | val df = readFromResources( 70 | spark, 71 | path = "apache_poi/57231_MixedGasReport.xls", 72 | options = Map("dataAddress" -> "'Coefficient Table'!A6", "ignoreAfterHeader" -> 2, "inferSchema" -> true) 73 | ).limit(1) 74 | val expected = spark.createDataFrame(expectedData_01, expectedSchema_01) 75 | assertDataFrameApproximateEquals(expected, df, 0.1e-1) 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/DataFrameWriterApiComplianceSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql._ 21 | import org.scalatest.wordspec.AnyWordSpec 22 | 23 | class DataFrameWriterApiComplianceSuite extends AnyWordSpec with DataFrameSuiteBase with LocalFileTestingUtilities { 24 | 25 | private def simpleDf = { 26 | val data = Seq(("foo", "bar", "1"), ("baz", "bang", "2")) 27 | spark.createDataFrame(data).toDF("col1", "col2", "col3") 28 | } 29 | 30 | /** Checks that the excel data files in given folder equal the provided dataframe */ 31 | private def assertWrittenExcelData(expectedDf: DataFrame, folder: String): Unit = { 32 | val actualDf = spark.read 33 | .format("excel") 34 | .option("path", folder) 35 | .load() 36 | 37 | /* assertDataFrameNoOrderEquals is sensitive to order of columns, so we 38 | order both dataframes in the same way 39 | */ 40 | val orderedSchemaColumns = expectedDf.schema.fields.map(f => f.name).sorted 41 | 42 | assertDataFrameNoOrderEquals( 43 | expectedDf.select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*), 44 | actualDf.select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*) 45 | ) 46 | 47 | } 48 | "excel v2 complies to DataFrameWriter SaveMode and Partitioning behavior" can { 49 | 50 | val writeModes = Seq(SaveMode.Overwrite, SaveMode.Append) 51 | 52 | for (writeMode <- writeModes) { 53 | s"write a dataframe to xlsx with ${writeMode.toString}" in withExistingCleanTempDir("v2") { targetDir => 54 | // create a df from csv then write as xlsx 55 | val df = simpleDf 56 | 57 | df.write 58 | .format("excel") 59 | .option("path", targetDir) 60 | .option("header", value = true) 61 | .mode(writeMode) 62 | .save() 63 | 64 | val listOfFiles = getListOfFilesFilteredByExtension(targetDir, "xlsx") 65 | assert(listOfFiles.nonEmpty, s"expected at least one excel file") 66 | 67 | // is the result really the same? 68 | assertWrittenExcelData(df, targetDir) 69 | 70 | } 71 | s"write a dataframe to xlsx with ${writeMode.toString} (partitioned)" in withExistingCleanTempDir("v2") { 72 | targetDir => 73 | assume(spark.sparkContext.version >= "3.0.1") 74 | // create a df from csv then write as xlsx 75 | val df = simpleDf 76 | 77 | df.write 78 | .partitionBy("col1") 79 | .format("excel") 80 | .option("path", targetDir) 81 | .option("header", value = true) 82 | .mode(writeMode) 83 | .save() 84 | 85 | // some file based checks 86 | val listOfFolders = getListOfFolders(targetDir) 87 | assert(listOfFolders.length == 2, s"expected two folders because there are two partitions") 88 | for (folder <- listOfFolders) { 89 | assert(folder.getName.startsWith("col1="), s"expected partition folders and those must start with col1=") 90 | val listOfFiles = getListOfFilesFilteredByExtension(folder.getAbsolutePath, "xlsx") 91 | assert(listOfFiles.nonEmpty, s"expected at least one xlsx per folder but got $listOfFiles") 92 | } 93 | 94 | // is the result really the same? 
95 | assertWrittenExcelData(df, targetDir) 96 | 97 | } 98 | } 99 | 100 | for (isPartitioned <- Seq(false, true)) { 101 | s"multiple appends to folder (partitioned == $isPartitioned)" in withExistingCleanTempDir("v2") { targetDir => 102 | if (isPartitioned) { 103 | assume(spark.sparkContext.version >= "3.0.1") 104 | } 105 | 106 | val df = simpleDf 107 | 108 | val dfWriter = if (isPartitioned) df.write.partitionBy("col1") else df.write // partition only in the partitioned scenario 109 | 110 | dfWriter 111 | .format("excel") 112 | .option("path", targetDir) 113 | .option("header", value = true) 114 | .mode(SaveMode.Append) 115 | .save() 116 | dfWriter 117 | .format("excel") 118 | .option("path", targetDir) 119 | .option("header", value = true) 120 | .mode(SaveMode.Append) 121 | .save() 122 | 123 | val orderedSchemaColumns = df.schema.fields.map(f => f.name).sorted 124 | val expectedDf = 125 | df.union(df).select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*) 126 | 127 | assertWrittenExcelData(expectedDf, targetDir) 128 | } 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/EncryptedReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql._ 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import scala.jdk.CollectionConverters._ 25 | 26 | object EncryptedReadSuite { 27 | val simpleSchema = StructType( 28 | List( 29 | StructField("A", IntegerType, true), 30 | StructField("B", IntegerType, true), 31 | StructField("C", IntegerType, true), 32 | StructField("D", IntegerType, true) 33 | ) 34 | ) 35 | 36 | val expectedData = List(Row(1, 2, 3, 4)).asJava 37 | } 38 | 39 | class EncryptedReadSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 40 | import EncryptedReadSuite._ 41 | 42 | lazy val expected = spark.createDataFrame(expectedData, simpleSchema) 43 | 44 | test("read encrypted xlsx file") { 45 | val df = readFromResources( 46 | spark, 47 | path = "simple_encrypted.xlsx", 48 | options = Map( 49 | "dataAddress" -> "Sheet1!A1", 50 | "treatEmptyValuesAsNulls" -> true, 51 | "workbookPassword" -> "fooba", 52 | "inferSchema" -> true 53 | ) 54 | ) 55 | assertDataFrameEquals(expected, df) 56 | } 57 | 58 | test("read encrypted xlsx file (maxRowsInMemory)") { 59 | val df = readFromResources( 60 | spark, 61 | path = "simple_encrypted.xlsx", 62 | options = Map( 63 | "dataAddress" -> "Sheet1!A1", 64 | "treatEmptyValuesAsNulls" -> true, 65 | "workbookPassword" -> "fooba", 66 | "maxRowsInMemory" -> 1, 67 | "inferSchema" -> true 68 | ) 69 | ) 70 | assertDataFrameEquals(expected, df) 71 | } 72 | 73 | test("read encrypted xls file") { 74 | val df = readFromResources( 75 | spark, 76 | path = "simple_encrypted.xls", 77 | options = Map( 78 | "dataAddress" -> "Sheet1!A1", 79 | "treatEmptyValuesAsNulls" -> true, 80 | "workbookPassword" -> "fooba", 81 | "inferSchema" -> true 82 | ) 83 | ) 84 | assertDataFrameEquals(expected, df) 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/ErrorsAsStringsReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.sql.Timestamp 25 | import java.time.LocalDateTime 26 | import java.util 27 | import scala.jdk.CollectionConverters._ 28 | 29 | object ErrorsAsStringsReadSuite { 30 | private val dummyTimestamp = Timestamp.valueOf(LocalDateTime.of(2021, 2, 19, 0, 0)) 31 | private val dummyText = "hello" 32 | 33 | private val expectedSchemaInfer = StructType( 34 | List( 35 | StructField("double", IntegerType, true), 36 | StructField("boolean", BooleanType, true), 37 | StructField("timestamp", TimestampType, true), 38 | StructField("string", StringType, true), 39 | StructField("formula", StringType, true) 40 | ) 41 | ) 42 | 43 | private val expectedDataErrorsAsNullInfer: util.List[Row] = List( 44 | Row(1, true, dummyTimestamp, dummyText, "A1"), 45 | Row(2, false, dummyTimestamp, dummyText, "A3"), 46 | Row(null, null, null, null, null), 47 | Row(null, null, null, null, null) 48 | ).asJava 49 | 50 | private val expectedDataErrorsAsStringsInfer: util.List[Row] = List( 51 | Row(1, true, dummyTimestamp, dummyText, "A1"), 52 | Row(2, false, dummyTimestamp, dummyText, "A3"), 53 | Row(null, null, null, "#NULL!", "#DIV/0!"), 54 | Row(null, null, null, "#N/A", "#NAME?") 55 | ).asJava 56 | 57 | private val expectedSchemaNonInfer = StructType( 58 | List( 59 | StructField("double", StringType, true), 60 | StructField("boolean", StringType, true), 61 | StructField("timestamp", StringType, true), 62 | StructField("string", StringType, true), 63 | StructField("formula", StringType, true) 64 | ) 65 | ) 66 | 67 | private val expectedDataErrorsAsNullNonInfer: util.List[Row] = List( 68 | Row("1", "TRUE", """19"-"Feb"-"2021""", "hello", "A1"), 69 | Row("2", "FALSE", """19"-"Feb"-"2021""", "hello", "A3"), 70 | Row(null, null, null, null, null), 71 | Row(null, null, null, null, null) 72 | ).asJava 73 | 74 | private val expectedDataErrorsAsStringsNonInfer: util.List[Row] = List( 75 | Row("1", "TRUE", """19"-"Feb"-"2021""", dummyText, "A1"), 76 | Row("2", "FALSE", """19"-"Feb"-"2021""", dummyText, "A3"), 77 | Row("#NULL!", "#NULL!", "#NULL!", "#NULL!", "#DIV/0!"), 78 | Row("#N/A", "#N/A", "#N/A", "#N/A", "#NAME?") 79 | ).asJava 80 | } 81 | 82 | /** Breaking change with V1: For Spark String Type field, Error Cell has an option to either get error value or null as 83 | * any other Spark Types 84 | * 85 | * Related issues: Support ERROR cell type when using inferSchema=true link: 86 | * https://github.dev/mauch/spark-excel/pull/343 87 | */ 88 | class ErrorsAsStringsReadSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 89 | import ErrorsAsStringsReadSuite._ 90 | 91 | test("error cells as null when useNullForErrorCells=true and inferSchema=true") { 92 | val df = readFromResources( 93 | spark, 94 | path = "with_errors_all_types.xlsx", 95 | options = Map("inferSchema" -> true, "useNullForErrorCells" -> true) 96 | ) 97 | val expected = spark.createDataFrame(expectedDataErrorsAsNullInfer, expectedSchemaInfer) 98 | assertDataFrameEquals(expected, df) 99 | } 100 | 101 | test("errors as null for non-string type with useNullForErrorCells=false and inferSchema=true") { 102 | val df = readFromResources( 103 | spark, 104 | path = "with_errors_all_types.xlsx", 105 | options = Map("inferSchema" -> true, "useNullForErrorCells" -> false) 106 | ) 107 | val expected = 
spark.createDataFrame(expectedDataErrorsAsStringsInfer, expectedSchemaInfer) 108 | assertDataFrameEquals(expected, df) 109 | } 110 | 111 | test("errors in string format when useNullForErrorCells=true and inferSchema=false") { 112 | val df = readFromResources( 113 | spark, 114 | path = "with_errors_all_types.xlsx", 115 | options = Map("inferSchema" -> false, "useNullForErrorCells" -> true) 116 | ) 117 | val expected = spark.createDataFrame(expectedDataErrorsAsNullNonInfer, expectedSchemaNonInfer) 118 | assertDataFrameEquals(expected, df) 119 | } 120 | 121 | test("errors in string format when useNullForErrorCells=false and inferSchema=false") { 122 | val df = readFromResources( 123 | spark, 124 | path = "with_errors_all_types.xlsx", 125 | options = Map("inferSchema" -> false, "useNullForErrorCells" -> false) 126 | ) 127 | val expected = spark 128 | .createDataFrame(expectedDataErrorsAsStringsNonInfer, expectedSchemaNonInfer) 129 | assertDataFrameEquals(expected, df) 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/ExcelTestingUtilities.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql._ 20 | import org.apache.spark.sql.types.StructType 21 | import scala.reflect.io.Directory 22 | import java.io.File 23 | 24 | trait ExcelTestingUtilities { 25 | 26 | private val dataRoot = getClass.getResource("/spreadsheets").getPath 27 | 28 | /** Load excel data from resource folder 29 | * 30 | * @param spark 31 | * spark session 32 | * @param path 33 | * relative path to the resource/speadsheets 34 | * @param options 35 | * extra loading option 36 | * @return 37 | * data frame 38 | */ 39 | def readFromResources(spark: SparkSession, path: String, options: Map[String, Any]): DataFrame = 40 | spark.read 41 | .format("excel") 42 | .options(options.map(p => (p._1 -> p._2.toString()))) 43 | .load(s"$dataRoot/$path") 44 | 45 | /** Load excel data from resource folder with user defined schema 46 | * 47 | * @param spark 48 | * spark session 49 | * @param path 50 | * relative path to the resource/speadsheets 51 | * @param options 52 | * extra loading option 53 | * @param schema 54 | * user provided schema 55 | * @return 56 | * data frame 57 | */ 58 | def readFromResources(spark: SparkSession, path: String, options: Map[String, Any], schema: StructType): DataFrame = 59 | spark.read 60 | .format("excel") 61 | .options(options.map(p => (p._1 -> p._2.toString()))) 62 | .schema(schema) 63 | .load(s"$dataRoot/$path") 64 | 65 | /** Delete directory recursively. Intended for temporary testing data only. Use with causion! 
66 | * 67 | * @param path 68 | * to be deleted 69 | */ 70 | def deleteDirectory(path: String): Unit = { 71 | val directory = new Directory(new File(path)) 72 | directory.deleteRecursively() 73 | () 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/GlobPartitionAndFileNameSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.functions.input_file_name 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | /** Issue References: 25 | * 26 | * #52. input_file_name returns empty string https://github.dev/mauch/spark-excel/issues/52 27 | * 28 | * #74. Allow reading multiple files specified as a list OR by a pattern https://github.dev/mauch/spark-excel/issues/74 29 | * 30 | * #97. Reading multiple files https://github.dev/mauch/spark-excel/issues/97 31 | */ 32 | 33 | object GlobPartitionAndFileNameSuite { 34 | val expectedInferredSchema = StructType( 35 | List( 36 | StructField("Day", IntegerType, true), 37 | StructField("Month", IntegerType, true), 38 | StructField("Customer ID", StringType, true), 39 | StructField("Customer Name", StringType, true), 40 | StructField("Standard Package", IntegerType, true), 41 | StructField("Extra Option 1", IntegerType, true), 42 | StructField("Extra Option 2", IntegerType, true), 43 | StructField("Extra Option 3", IntegerType, true), 44 | StructField("Staff", StringType, true) 45 | ) 46 | ) 47 | 48 | val expectedWithFilenameSchema = StructType( 49 | List( 50 | StructField("Day", IntegerType, true), 51 | StructField("Month", IntegerType, true), 52 | StructField("Customer ID", StringType, true), 53 | StructField("Customer Name", StringType, true), 54 | StructField("Standard Package", IntegerType, true), 55 | StructField("Extra Option 1", IntegerType, true), 56 | StructField("Extra Option 2", IntegerType, true), 57 | StructField("Extra Option 3", IntegerType, true), 58 | StructField("Staff", StringType, true), 59 | StructField("file_name", StringType, false) 60 | ) 61 | ) 62 | 63 | val expectedWithPartitionSchema = StructType( 64 | List( 65 | StructField("Day", IntegerType, true), 66 | StructField("Month", IntegerType, true), 67 | StructField("Customer ID", StringType, true), 68 | StructField("Customer Name", StringType, true), 69 | StructField("Standard Package", IntegerType, true), 70 | StructField("Extra Option 1", IntegerType, true), 71 | StructField("Extra Option 2", IntegerType, true), 72 | StructField("Extra Option 3", IntegerType, true), 73 | StructField("Staff", StringType, true), 74 | StructField("Quarter", IntegerType, true) 75 | ) 76 | ) 77 | } 78 | 79 | class GlobPartitionAndFileNameSuite extends AnyFunSuite 
with DataFrameSuiteBase with ExcelTestingUtilities { 80 | import GlobPartitionAndFileNameSuite._ 81 | 82 | private val sharedOptions = Map("header" -> true, "inferSchema" -> true) 83 | 84 | test("read multiple files must infer correct schema with inferSchema=true") { 85 | val df = readFromResources(spark, "ca_dataset/2019/Quarter=4/*.xlsx", sharedOptions) 86 | assert(df.schema == expectedInferredSchema) 87 | } 88 | 89 | test("read multiple files with input_file_name") { 90 | val df = readFromResources(spark, "ca_dataset/2019/Quarter=4/*.xlsx", sharedOptions) 91 | .withColumn("file_name", input_file_name()) 92 | assert(df.schema == expectedWithFilenameSchema) 93 | 94 | /* And validate list of filename */ 95 | val names = df 96 | .select("file_name") 97 | .distinct() 98 | .collect() 99 | .map(r => r.getString(0)) 100 | .map(p => p.split("[\\/]").last) // this works on Windows too 101 | .toSet 102 | assert(names == Set[String]("ca_10.xlsx", "ca_11.xlsx", "ca_12.xlsx")) 103 | } 104 | 105 | test("read whole folder with partition") { 106 | val df = readFromResources(spark, "ca_dataset/2019", sharedOptions) 107 | assert(df.schema == expectedWithPartitionSchema) 108 | 109 | /* And validate list of Quarters */ 110 | val quarters = df.select("Quarter").distinct().collect().map(r => r.getInt(0)).toSet 111 | assert(quarters == Set[Int](1, 2, 3, 4)) 112 | } 113 | 114 | test("read multiple files must has same number total number of rows") { 115 | val q4_total = readFromResources(spark, "ca_dataset/2019/Quarter=4/*.xlsx", sharedOptions) 116 | .count() 117 | 118 | val q4_sum = Seq("ca_10.xlsx", "ca_11.xlsx", "ca_12.xlsx") 119 | .map(name => readFromResources(spark, s"ca_dataset/2019/Quarter=4/$name", sharedOptions).count()) 120 | .sum 121 | 122 | assert(q4_total > 0) 123 | assert(q4_total == q4_sum) 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/InferStricterNumericalTypesSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.util 25 | import scala.jdk.CollectionConverters._ 26 | 27 | object InferStricterNumericalTypesSuite { 28 | val expectedInferredSchema = StructType( 29 | List( 30 | StructField("ID", StringType, true), 31 | StructField("Integer Value Range", IntegerType, true), 32 | StructField("Long Value Range", LongType, true), 33 | StructField("Double Value Range", DoubleType, true) 34 | ) 35 | ) 36 | 37 | /** Stricter types for numerical values 38 | */ 39 | val expectedDataInferSchema: util.List[Row] = List( 40 | Row("Gas & Oil", 2147482967, 92147483647L, 90315085.71d), 41 | Row("Telecomunication", 2147483099, 102147483647L, -965079398.74d), 42 | Row("Manufacturing", 2147482826, 112147483647L, -353020871.56d), 43 | Row("Farming", 2147482838, -102147483647L, -446026564.15d), 44 | Row("Service", 2147483356, -112147483647L, -820766945.73d) 45 | ).asJava 46 | } 47 | 48 | class InferStricterNumericalTypesSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 49 | import InferStricterNumericalTypesSuite._ 50 | 51 | test("stricter numerical types usePlainNumberFormat=true and inferSchema=true (xlsx)") { 52 | val df = readFromResources( 53 | spark, 54 | path = "infer_stricter_numerical_types.xlsx", 55 | options = Map("usePlainNumberFormat" -> true, "inferSchema" -> true) 56 | ) 57 | val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema) 58 | assertDataFrameEquals(expected, df) 59 | } 60 | 61 | test("stricter numerical types usePlainNumberFormat=false and inferSchema=true (xlsx)") { 62 | val df = readFromResources( 63 | spark, 64 | path = "infer_stricter_numerical_types.xlsx", 65 | options = Map("usePlainNumberFormat" -> false, "inferSchema" -> true) 66 | ) 67 | val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema) 68 | assertDataFrameEquals(expected, df) 69 | } 70 | 71 | test("stricter numerical types usePlainNumberFormat=true and inferSchema=true (xls)") { 72 | val df = readFromResources( 73 | spark, 74 | path = "infer_stricter_numerical_types.xls", 75 | options = Map("usePlainNumberFormat" -> true, "inferSchema" -> true) 76 | ) 77 | val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema) 78 | assertDataFrameEquals(expected, df) 79 | } 80 | 81 | test("stricter numerical types usePlainNumberFormat=false and inferSchema=true (xls)") { 82 | val df = readFromResources( 83 | spark, 84 | path = "infer_stricter_numerical_types.xls", 85 | options = Map("usePlainNumberFormat" -> false, "inferSchema" -> true) 86 | ) 87 | val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema) 88 | assertDataFrameEquals(expected, df) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/KeepUndefinedRowsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.util 25 | import scala.jdk.CollectionConverters._ 26 | 27 | object KeepUndefinedRowsSuite { 28 | 29 | /* Issue: https://github.dev/mauch/spark-excel/issues/285 */ 30 | val expectedSchema_Issue285 = StructType( 31 | List(StructField("1", StringType, true), StructField("2", StringType, true), StructField("3", StringType, true)) 32 | ) 33 | 34 | /** No change to the spark-excel, Apache POI also produce same result with sheet.iterator 35 | * 36 | * Workaround: https://stackoverflow.com/questions/47790569/how-to-avoid-skipping-blank-rows-or-columns-in-apache-poi 37 | * Doc: http://poi.apache.org/components/spreadsheet/quick-guide.html#Iterator 38 | */ 39 | val expectedData_Issue285: util.List[Row] = List( 40 | Row("File info", null, null), 41 | Row("Info", "Info", "Info"), 42 | Row("Metadata", null, null), 43 | Row(null, "1", "2"), 44 | Row("A", "1", "2"), 45 | Row("B", "5", "6"), 46 | Row("C", "9", "10"), 47 | Row("Metadata", null, null), 48 | Row(null, "1", "2"), 49 | Row("A", "1", "2"), 50 | Row("B", "4", "5"), 51 | Row("C", "7", "8") 52 | ).asJava 53 | 54 | /* With newly introduced keepUndefinedRows option */ 55 | val expectedData_KeepUndefinedRows_Issue285: util.List[Row] = List( 56 | Row("File info", null, null), 57 | Row("Info", "Info", "Info"), 58 | Row(null, null, null), 59 | Row("Metadata", null, null), 60 | Row(null, null, null), 61 | Row(null, "1", "2"), 62 | Row("A", "1", "2"), 63 | Row("B", "5", "6"), 64 | Row("C", "9", "10"), 65 | Row(null, null, null), 66 | Row(null, null, null), 67 | Row("Metadata", null, null), 68 | Row(null, null, null), 69 | Row(null, "1", "2"), 70 | Row("A", "1", "2"), 71 | Row("B", "4", "5"), 72 | Row("C", "7", "8") 73 | ).asJava 74 | 75 | /** Issue: https://github.dev/mauch/spark-excel/issues/162 Spark-excel still infers to Double-Type, however, user can 76 | * provide custom scheme and Spark-excel should load to IntegerType or LongType accordingly 77 | */ 78 | val userDefined_Issue162 = StructType( 79 | List( 80 | StructField("ID", IntegerType, true), 81 | StructField("address", StringType, true), 82 | StructField("Pin", IntegerType, true) 83 | ) 84 | ) 85 | 86 | val expectedData_Issue162: util.List[Row] = 87 | List(Row(123123, "Asdadsas, Xyxyxy, 123xyz", 123132), Row(123124, "Asdadsas1, Xyxyxy, 123xyz", 123133)).asJava 88 | 89 | } 90 | 91 | class KeepUndefinedRowsSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 92 | import KeepUndefinedRowsSuite._ 93 | 94 | test("#285 undefined rows: no keep") { 95 | val df = readFromResources( 96 | spark, 97 | path = "issue_285_bryce21.xlsx", 98 | options = Map("header" -> false, "inferSchema" -> false, "keepUndefinedRows" -> false), 99 | schema = expectedSchema_Issue285 100 | ) 101 | val expected = spark.createDataFrame(expectedData_Issue285, expectedSchema_Issue285) 102 | assertDataFrameEquals(expected, 
df) 103 | } 104 | 105 | test("#162 load integer values with user defined schema") { 106 | val df = readFromResources( 107 | spark, 108 | path = "issue_162_nihar_gharat.xlsx", 109 | options = Map("header" -> true), 110 | schema = userDefined_Issue162 111 | ) 112 | val expected = spark.createDataFrame(expectedData_Issue162, userDefined_Issue162) 113 | assertDataFrameEquals(expected, df) 114 | } 115 | 116 | for (sheetName <- Seq("blank_row", "space_row")) { 117 | test(s"#965 handling of NULL/BLANK column values (streamingReader, keepUndefinedRows==false, sheet=$sheetName)") { 118 | val df = readFromResources( 119 | spark, 120 | path = "issue_965_blank_rows.xlsx", 121 | options = Map( 122 | "dataAddress" -> s"'${sheetName}'!A1", 123 | "inferSchema" -> true, 124 | "header" -> true, 125 | "maxRowsInMemory" -> "1000", 126 | "keepUndefinedRows" -> false 127 | ) 128 | ) 129 | assert(df.schema.fields.length == 5) // sheet 001 has 5 columns 130 | /* 131 | sheet "blank_row" has row 2 and 4 defined, while row 3 is not defined in excel xml and row 5 contains empty cells in excel xml 132 | => 2 rows in total (prior the fix row 5 was added as well) 133 | sheet "space_row" has row 2 and 4 defined with some values, row 3 contains just a whitespace in A3 134 | => 3 rows in total (just to test that a single whitespace is handled correctly) 135 | */ 136 | if (sheetName == "blank_row") { 137 | assert(df.count() == 2) 138 | } else { 139 | assert(df.count() == 3) 140 | } 141 | } 142 | } 143 | 144 | for (keepUndefinedRows <- Seq(false, true)) { 145 | test(s"#965 handling of NULL/BLANK column values (NON-streaming-Reader, keepUndefinedRows==$keepUndefinedRows)") { 146 | val df = readFromResources( 147 | spark, 148 | path = "issue_965_blank_rows.xlsx", 149 | options = Map( 150 | "dataAddress" -> s"'blank_row'!A1", 151 | "inferSchema" -> true, 152 | "header" -> true, 153 | "keepUndefinedRows" -> keepUndefinedRows 154 | ) 155 | ) 156 | assert(df.schema.fields.length == 5) // sheet 001 has 5 columns 157 | /* 158 | sheet "blank_row" has row 2 and 4 defined, while row 3 is not defined in excel xml and row 5 contains empty cells in excel xml 159 | * keepUndefinedRows == true => 4 rows in total 160 | * keepUndefinedRows == false => 2 rows in total 161 | */ 162 | if (keepUndefinedRows) { 163 | assert(df.count() == 4) 164 | } else { 165 | assert(df.count() == 2) 166 | } 167 | } 168 | } 169 | 170 | } 171 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/LocalFileTestingUtilities.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import java.io.File 20 | import java.nio.file.Files 21 | 22 | trait LocalFileTestingUtilities { 23 | 24 | /** Returns the list of files in given directory/folder (this is not recursive) 25 | */ 26 | def getListOfFiles(folder: String): List[File] = { 27 | val d = new File(folder) 28 | if (d.exists && d.isDirectory) { 29 | d.listFiles.filter(_.isFile).toList 30 | } else { 31 | List[File]() 32 | } 33 | } 34 | 35 | /** similar to getListOfFiles but filters the files by the given file extension */ 36 | def getListOfFilesFilteredByExtension(targetDir: String, filteredByExtension: String): Seq[String] = { 37 | val filesInTargetDir = getListOfFiles(targetDir) 38 | filesInTargetDir.filter(_.getName.endsWith(filteredByExtension)).map(_.getName) 39 | } 40 | 41 | /** Returns the list of folders in given directory/folder (this is not recursive */ 42 | def getListOfFolders(folder: String): List[File] = { 43 | val d = new File(folder) 44 | if (d.exists && d.isDirectory) { 45 | d.listFiles.filter(_.isDirectory).toList 46 | } else { 47 | List[File]() 48 | } 49 | } 50 | 51 | /** Deletes the (non-empty) directory (recursively) 52 | */ 53 | def deleteDirectoryRecursively(folderToDelete: File): Unit = { 54 | val allContents = folderToDelete.listFiles 55 | if (allContents != null) for (file <- allContents) { 56 | deleteDirectoryRecursively(file) 57 | } 58 | folderToDelete.delete 59 | () 60 | } 61 | 62 | /** fixture that creates a temporary folder and deletes it after test completion */ 63 | def withExistingCleanTempDir(name: String): (String => Unit) => Unit = { 64 | 65 | def fixture(testCode: String => Unit): Unit = { 66 | 67 | val directory = Files.createTempDirectory(name) 68 | 69 | try testCode(directory.toString) 70 | finally deleteDirectoryRecursively(directory.toFile) 71 | } 72 | 73 | fixture 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/ManyPartitionReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql._ 21 | import org.apache.spark.sql.functions.col 22 | import org.apache.spark.sql.types.IntegerType 23 | import org.scalatest.wordspec.AnyWordSpec 24 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 25 | 26 | class ManyPartitionReadSuite extends AnyWordSpec with DataFrameSuiteBase with LocalFileTestingUtilities { 27 | 28 | /** Checks that the excel data files in given folder equal the provided dataframe */ 29 | private def assertWrittenExcelData(expectedDf: DataFrame, folder: String): Unit = { 30 | val actualDf = spark.read 31 | .format("excel") 32 | .option("path", folder) 33 | .load() 34 | 35 | /* assertDataFrameNoOrderEquals is sensitive to order of columns, so we 36 | order both dataframes in the same way 37 | */ 38 | val orderedSchemaColumns = expectedDf.schema.fields.map(f => f.name).sorted 39 | 40 | assertDataFrameNoOrderEquals( 41 | expectedDf.select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*), 42 | actualDf.select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*) 43 | ) 44 | 45 | } 46 | 47 | def createExpected(targetDir: String): DataFrame = { 48 | 49 | // Generate data programmatically 50 | val data = (1 to 19).flatMap { col1 => 51 | // Each col1 value has multiple rows (around 10-11 rows each) 52 | val rowsPerPartition = if (col1 == 1) 8 else if (col1 == 2) 16 else 11 53 | (0 until rowsPerPartition).map { i => 54 | val index = (col1 - 1) * 11 + i + 1234 // Starting from 1234 as in original data 55 | Row( 56 | Integer.valueOf(col1), // Make it nullable Integer 57 | s"fubar_$index", 58 | s"bazbang_${index + 77000}", 59 | s"barfang_${index + 237708}", 60 | s"lorem_ipsum_$index" 61 | ) 62 | } 63 | } 64 | 65 | // Define schema explicitly to match expected nullability 66 | val schema = StructType( 67 | Array( 68 | StructField("col1", IntegerType, nullable = true), 69 | StructField("col2", StringType, nullable = true), 70 | StructField("col3", StringType, nullable = true), 71 | StructField("col4", StringType, nullable = true), 72 | StructField("col5", StringType, nullable = true) 73 | ) 74 | ) 75 | 76 | val dfInput = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 77 | 78 | val dfFinal = dfInput.union(dfInput) 79 | 80 | val dfWriter = dfFinal.write 81 | .partitionBy("col1") 82 | .format("excel") 83 | .option("path", targetDir) 84 | .option("header", value = true) 85 | .mode(SaveMode.Append) 86 | 87 | dfWriter.save() 88 | dfWriter.save() 89 | 90 | val orderedSchemaColumns = dfInput.schema.fields.map(f => f.name).sorted 91 | 92 | dfFinal 93 | .union(dfFinal) 94 | .withColumn("col1", col("col1").cast(IntegerType)) 95 | .select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*) 96 | 97 | } 98 | 99 | for (run <- Range(0, 3)) { 100 | 101 | s"many partitions read (run=$run)" in withExistingCleanTempDir("v2") { targetDir => 102 | assume(spark.sparkContext.version >= "3.0.1") 103 | val expectedDf = createExpected(targetDir) 104 | assertWrittenExcelData(expectedDf, targetDir) 105 | } 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/NumericTypesSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.util 25 | import scala.jdk.CollectionConverters._ 26 | 27 | /** For schema infering as well as loading for various numeric types {Integer, Long, Double} 28 | */ 29 | object NumericTypesSuite { 30 | 31 | val userDefinedSchema_01 = StructType( 32 | List( 33 | StructField("Day", IntegerType, true), 34 | StructField("Month", IntegerType, true), 35 | StructField("Customer ID", StringType, true), 36 | StructField("Customer Name", StringType, true), 37 | StructField("Standard Package", IntegerType, true), 38 | StructField("Extra Option 1", IntegerType, true), 39 | StructField("Extra Option 2", IntegerType, true), 40 | StructField("Extra Option 3", IntegerType, true), 41 | StructField("Staff", StringType, true) 42 | ) 43 | ) 44 | 45 | val expectedData_01: util.List[Row] = List( 46 | Row(1, 12, "CA869", "Phạm Uyển Trinh", null, null, 2200, null, "Ella Fitzgerald"), 47 | Row(1, 12, "CA870", "Nguyễn Liên Thảo", null, null, 2000, 1350, "Ella Fitzgerald"), 48 | Row(1, 12, "CA871", "Lê Thị Nga", 17000, null, null, null, "Ella Fitzgerald"), 49 | Row(1, 12, "CA872", "Phan Tố Nga", null, null, 2000, null, "Teresa Teng"), 50 | Row(1, 12, "CA873", "Nguyễn Thị Teresa Teng", null, null, 1200, null, "Jesse Thomas") 51 | ).asJava 52 | 53 | val userDefinedSchema_02 = StructType( 54 | List( 55 | StructField("Day", LongType, true), 56 | StructField("Month", LongType, true), 57 | StructField("Customer ID", StringType, true), 58 | StructField("Customer Name", StringType, true), 59 | StructField("Standard Package", IntegerType, true), 60 | StructField("Extra Option 1", IntegerType, true), 61 | StructField("Extra Option 2", IntegerType, true), 62 | StructField("Extra Option 3", LongType, true), 63 | StructField("Staff", StringType, true) 64 | ) 65 | ) 66 | 67 | val expectedData_02: util.List[Row] = List( 68 | Row(1L, 12L, "CA869", "Phạm Uyển Trinh", null, null, 2200, null, "Ella Fitzgerald"), 69 | Row(1L, 12L, "CA870", "Nguyễn Liên Thảo", null, null, 2000, 1350L, "Ella Fitzgerald"), 70 | Row(1L, 12L, "CA871", "Lê Thị Nga", 17000, null, null, null, "Ella Fitzgerald"), 71 | Row(1L, 12L, "CA872", "Phan Tố Nga", null, null, 2000, null, "Teresa Teng"), 72 | Row(1L, 12L, "CA873", "Nguyễn Thị Teresa Teng", null, null, 1200, null, "Jesse Thomas") 73 | ).asJava 74 | } 75 | 76 | class NumericTypesSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 77 | import NumericTypesSuite._ 78 | 79 | test("load with user defined schema with Integer types") { 80 | val df = readFromResources( 81 | spark, 82 | path = "ca_dataset/2019/Quarter=4/ca_12.xlsx", 83 | options = Map("header" -> true), 84 | schema = userDefinedSchema_01 85 | ).limit(5) 86 | val expected = spark.createDataFrame(expectedData_01, userDefinedSchema_01) 87 | 88 | 
assertDataFrameEquals(expected, df) 89 | } 90 | 91 | test("load with user defined schema with both Integer and Long types") { 92 | val df = readFromResources( 93 | spark, 94 | path = "ca_dataset/2019/Quarter=4/ca_12.xlsx", 95 | options = Map("header" -> true), 96 | schema = userDefinedSchema_02 97 | ).limit(5) 98 | val expected = spark.createDataFrame(expectedData_02, userDefinedSchema_02) 99 | 100 | assertDataFrameEquals(expected, df) 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/RowNumberColumnSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.util 25 | import scala.jdk.CollectionConverters._ 26 | 27 | /** Related issues: #40 Allow reading only a subset of rows https://github.dev/mauch/spark-excel/issues/40 #59 Rows are 28 | * returned in incorrect order on cluster https://github.dev/mauch/spark-excel/issues/59 #115 Add excel row number 29 | * column https://github.dev/mauch/spark-excel/issues/115 30 | */ 31 | object RowNumberColumnSuite { 32 | 33 | val expectedSchema = StructType( 34 | List( 35 | StructField("RowID", IntegerType, true), 36 | StructField("1", StringType, true), 37 | StructField("2", StringType, true), 38 | StructField("3", StringType, true) 39 | ) 40 | ) 41 | 42 | val expectedData_NoKeep: util.List[Row] = List( 43 | Row(0, "File info", null, null), 44 | Row(1, "Info", "Info", "Info"), 45 | Row(3, "Metadata", null, null), 46 | Row(5, null, "1", "2"), 47 | Row(6, "A", "1", "2"), 48 | Row(7, "B", "5", "6"), 49 | Row(8, "C", "9", "10"), 50 | Row(11, "Metadata", null, null), 51 | Row(13, null, "1", "2"), 52 | Row(14, "A", "1", "2"), 53 | Row(15, "B", "4", "5"), 54 | Row(16, "C", "7", "8") 55 | ).asJava 56 | 57 | val expectedData_Keep: util.List[Row] = List( 58 | Row(0, "File info", null, null), 59 | Row(1, "Info", "Info", "Info"), 60 | Row(null, null, null, null), 61 | Row(3, "Metadata", null, null), 62 | Row(null, null, null, null), 63 | Row(5, null, "1", "2"), 64 | Row(6, "A", "1", "2"), 65 | Row(7, "B", "5", "6"), 66 | Row(8, "C", "9", "10"), 67 | Row(null, null, null, null), 68 | Row(null, null, null, null), 69 | Row(11, "Metadata", null, null), 70 | Row(null, null, null, null), 71 | Row(13, null, "1", "2"), 72 | Row(14, "A", "1", "2"), 73 | Row(15, "B", "4", "5"), 74 | Row(16, "C", "7", "8") 75 | ).asJava 76 | 77 | val expectedSchema_Projection = StructType( 78 | List( 79 | StructField("3", StringType, true), 80 | StructField("RowID", IntegerType, true), 81 | StructField("2", StringType, true) 82 | ) 83 | ) 84 | 85 | val expectedData_Projection: 
util.List[Row] = List( 86 | Row(null, 0, null), 87 | Row("Info", 1, "Info"), 88 | Row(null, 3, null), 89 | Row("2", 5, "1"), 90 | Row("2", 6, "1"), 91 | Row("6", 7, "5"), 92 | Row("10", 8, "9"), 93 | Row(null, 11, null), 94 | Row("2", 13, "1"), 95 | Row("2", 14, "1"), 96 | Row("5", 15, "4"), 97 | Row("8", 16, "7") 98 | ).asJava 99 | 100 | } 101 | 102 | class RowNumberColumnSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 103 | import RowNumberColumnSuite._ 104 | 105 | test("read with additional excel row number column") { 106 | val df = readFromResources( 107 | spark, 108 | path = "issue_285_bryce21.xlsx", 109 | Map("header" -> false, "keepUndefinedRows" -> false, "columnNameOfRowNumber" -> "RowID"), 110 | schema = expectedSchema 111 | ) 112 | val expected = spark.createDataFrame(expectedData_NoKeep, expectedSchema) 113 | assertDataFrameEquals(expected, df) 114 | } 115 | 116 | test("read with additional excel row number column, keep undefined rows") { 117 | val df = readFromResources( 118 | spark, 119 | path = "/issue_285_bryce21.xlsx", 120 | Map("header" -> false, "keepUndefinedRows" -> true, "columnNameOfRowNumber" -> "RowID"), 121 | schema = expectedSchema 122 | ) 123 | val expected = spark.createDataFrame(expectedData_Keep, expectedSchema) 124 | assertDataFrameEquals(expected, df) 125 | } 126 | 127 | test("read with additional excel row number column, projection") { 128 | val df = readFromResources( 129 | spark, 130 | path = "/issue_285_bryce21.xlsx", 131 | Map("header" -> false, "keepUndefinedRows" -> false, "columnNameOfRowNumber" -> "RowID"), 132 | schema = expectedSchema 133 | ).select("3", "RowID", "2") 134 | val expected = spark.createDataFrame(expectedData_Projection, expectedSchema_Projection) 135 | assertDataFrameEquals(expected, df) 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/UserReportedIssuesSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.util 25 | import scala.jdk.CollectionConverters._ 26 | import java.sql.Date 27 | 28 | object UserReportedIssuesSuite { 29 | 30 | /** Issue: https://github.dev/mauch/spark-excel/issues/463 Cannot load Date and Decimal fields 31 | */ 32 | val userDefined_Issue463 = StructType( 33 | List( 34 | StructField("itm no", StringType, true), 35 | StructField("Expense", DecimalType(23, 10), true), 36 | StructField("Date", DateType, true) 37 | ) 38 | ) 39 | 40 | val expectedData_Issue463: util.List[Row] = 41 | List(Row("item1", Decimal("1.1"), Date.valueOf("2021-10-01"))).asJava 42 | 43 | } 44 | 45 | class UserReportedIssuesSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 46 | import UserReportedIssuesSuite._ 47 | 48 | test("#463 Date and decimal with user defined schema") { 49 | val df = readFromResources( 50 | spark, 51 | path = "issue_463_cristichircu.xlsx", 52 | options = Map("header" -> true), 53 | schema = userDefined_Issue463 54 | ) 55 | val expected = spark.createDataFrame(expectedData_Issue463, userDefined_Issue463) 56 | assertDataFrameEquals(expected, df) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/tags/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch 18 | 19 | import org.scalatest.Tag 20 | 21 | package object tags { 22 | object WIP extends Tag("dev.mauch.tags.WIP") 23 | } 24 | --------------------------------------------------------------------------------
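The V2 suites above all drive spark-excel through the standard DataFrameReader/DataFrameWriter API with format("excel"). For orientation, here is a minimal end-to-end sketch in the same style. It is not a file from this repository: it assumes a local SparkSession with the spark-excel artifact on the classpath, and the input path, output path, sheet address, and partition column name are placeholders; the option names are the ones exercised by the tests (dataAddress, header, inferSchema, maxRowsInMemory).

import org.apache.spark.sql.{SaveMode, SparkSession}

object SparkExcelUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("spark-excel usage sketch").getOrCreate()

    // Read one sheet of a workbook; path and sheet name are placeholders.
    val df = spark.read
      .format("excel")
      .option("dataAddress", "'Sheet1'!A1") // cell where the table starts
      .option("header", "true")             // first row holds column names
      .option("inferSchema", "true")        // infer Integer/Long/Double/Timestamp column types
      .option("maxRowsInMemory", "1000")    // stream rows instead of materializing the whole workbook
      .load("/tmp/spark-excel-demo/input/report.xlsx")

    // Write the result back out as xlsx, partitioned by a column (placeholder name).
    df.write
      .format("excel")
      .partitionBy("Quarter")
      .option("header", "true")
      .mode(SaveMode.Append)
      .save("/tmp/spark-excel-demo/output/report_by_quarter")

    spark.stop()
  }
}

As in DataFrameWriterApiComplianceSuite, the write target is a directory: each partition value becomes a col=value subfolder containing one or more xlsx part files, and reading that directory back reconstructs the partition column.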