├── .git-blame-ignore-revs ├── .github ├── ISSUE_TEMPLATE │ ├── config.yml │ └── generic.yml ├── dependabot.yml └── workflows │ ├── changelog.yaml │ ├── ci.yml │ ├── clean.yml │ ├── dependency-graph.yml │ ├── potential-duplicates.yml │ ├── pr-agent.yaml │ └── rebase.yml ├── .gitignore ├── .mill-jvm-opts ├── .mill-version ├── .scala-steward.conf ├── .scalafmt.conf ├── CHANGELOG.md ├── CONTRIBUTING.md ├── ISSUE_TEMPLATE.md ├── LICENSE ├── README.md ├── build.mill ├── docs ├── README.md ├── azure_synapse.md └── spark_excel_examples.ipynb ├── mill ├── private-key.pem.enc ├── scalastyle-config.xml └── src ├── README.md ├── main ├── 2.4 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ ├── ExcelSparkInternal.scala │ │ └── excel │ │ └── v2 │ │ ├── ExcelDataSource.scala │ │ ├── ExcelDateTimeStringUtils.scala │ │ ├── ExcelFilters.scala │ │ ├── ExcelOptions.scala │ │ ├── ExcelParserBase.scala │ │ ├── FailureSafeParser.scala │ │ └── SchemaUtils.scala ├── 3.0 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelDateTimeStringUtils.scala │ │ └── ExcelFilters.scala ├── 3.0_and_up │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelDataSource.scala │ │ └── ExcelFileFormat.scala ├── 3.0_to_3.1 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelOutputWriter.scala │ │ ├── ExcelTable.scala │ │ └── ExcelWriteBuilder.scala ├── 3.0_to_3.2 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelScan.scala │ │ └── ExcelScanBuilder.scala ├── 3.0_to_3.3 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ └── ExcelOptions.scala ├── 3.0_to_3.4.1 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelParserBase.scala │ │ └── ExcelPartitionReaderFactory.scala ├── 3.1 │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ └── ExcelDateTimeStringUtils.scala ├── 3.1_and_up │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ └── ExcelFilters.scala ├── 3.2_and_up │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelDateTimeStringUtils.scala │ │ ├── ExcelOutputWriter.scala │ │ ├── ExcelTable.scala │ │ └── ExcelWriteBuilder.scala ├── 3.3_and_up │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelScan.scala │ │ └── ExcelScanBuilder.scala ├── 3.4.2_and_up │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ ├── ExcelParserBase.scala │ │ └── ExcelPartitionReaderFactory.scala ├── 3.4_and_up │ └── scala │ │ └── dev │ │ └── mauch │ │ └── spark │ │ └── excel │ │ └── v2 │ │ └── ExcelOptions.scala ├── resources │ └── META-INF │ │ └── services │ │ └── org.apache.spark.sql.sources.DataSourceRegister └── scala │ └── dev │ └── mauch │ └── spark │ └── excel │ ├── DataColumn.scala │ ├── DataLocator.scala │ ├── DefaultSource.scala │ ├── DefaultSource15.scala │ ├── ExcelFileSaver.scala │ ├── ExcelRelation.scala │ ├── InferSchema.scala │ ├── PlainNumberFormat.scala │ ├── Utils.scala │ ├── WorkbookReader.scala │ ├── package.scala │ └── v2 │ ├── DataLocator.scala │ ├── ExcelGenerator.scala │ ├── ExcelHeaderChecker.scala │ ├── ExcelHelper.scala │ ├── ExcelInferSchema.scala │ ├── ExcelOptionsTrait.scala │ ├── ExcelParser.scala │ └── SheetData.scala └── test ├── resources ├── log4j2.properties └── spreadsheets │ ├── Issue_747_plain_number.xlsx │ ├── apache_poi │ ├── 
57231_MixedGasReport.xls │ └── DataTableCities.xlsx │ ├── ca_dataset │ └── 2019 │ │ ├── Quarter=1 │ │ └── ca_03.xlsx │ │ ├── Quarter=2 │ │ ├── ca_04.xlsx │ │ ├── ca_05.xlsx │ │ └── ca_06.xlsx │ │ ├── Quarter=3 │ │ ├── ca_07.xlsx │ │ ├── ca_08.xlsx │ │ └── ca_09.xlsx │ │ └── Quarter=4 │ │ ├── ca_10.xlsx │ │ ├── ca_11.xlsx │ │ └── ca_12.xlsx │ ├── infer_stricter_numerical_types.xls │ ├── infer_stricter_numerical_types.xlsx │ ├── issue_162_nihar_gharat.xlsx │ ├── issue_285_bryce21.xlsx │ ├── issue_463_cristichircu.xlsx │ ├── issue_942_sheetname_digits.xlsx │ ├── issue_944_faulty_dimension.md │ ├── issue_944_faulty_dimension.xlsx │ ├── issue_965_blank_rows.md │ ├── issue_965_blank_rows.xlsx │ ├── plain_number.xlsx │ ├── read_multiple_sheets_at_once.xlsx │ ├── read_multiple_sheets_at_once_noheader.xlsx │ ├── simple_encrypted.xls │ ├── simple_encrypted.xlsx │ └── with_errors_all_types.xlsx └── scala └── dev └── mauch ├── spark ├── DataFrameSuiteBase.scala └── excel │ ├── DataLocatorSuite.scala │ ├── EncryptedReadSuite.scala │ ├── ErrorsAsStringsReadSuite.scala │ ├── Generators.scala │ ├── IntegrationSuite.scala │ ├── PlainNumberReadSuite.scala │ ├── RichRowSuite.scala │ └── v2 │ ├── AreaReferenceReadSuite.scala │ ├── DataFrameWriterApiComplianceSuite.scala │ ├── EncryptedReadSuite.scala │ ├── ErrorsAsStringsReadSuite.scala │ ├── ExcelTestingUtilities.scala │ ├── GlobPartitionAndFileNameSuite.scala │ ├── InferStricterNumericalTypesSuite.scala │ ├── KeepUndefinedRowsSuite.scala │ ├── LocalFileTestingUtilities.scala │ ├── ManyPartitionReadSuite.scala │ ├── NumericTypesSuite.scala │ ├── PlainNumberReadSuite.scala │ ├── ProjectionAndFilterPushdownSuite.scala │ ├── RowNumberColumnSuite.scala │ ├── TableReadSuite.scala │ ├── UserReportedIssuesSuite.scala │ └── WriteAndReadSuite.scala └── tags └── package.scala /.git-blame-ignore-revs: -------------------------------------------------------------------------------- 1 | # Scala Steward: Reformat with scalafmt 3.6.1 2 | a834cf94453ed2f3ab1b87818c2fd124fe87fa2a 3 | 4 | # Scala Steward: Reformat with scalafmt 3.7.11 5 | 11269e71a3460ae21f2a96ac8416c0bdd3f1f3b0 6 | 7 | # Scala Steward: Reformat with scalafmt 3.7.15 8 | 17f6ce5807fb3a91938824a285e30f786adea570 9 | 10 | # Scala Steward: Reformat with scalafmt 3.7.17 11 | e4fde8d1e6e34db2d24949275429ce3a7885c2ad 12 | 13 | # Scala Steward: Reformat with scalafmt 3.8.5 14 | 59dd3ea00b8772fd4e8798fde7941c1745ca83f2 15 | 16 | # Scala Steward: Reformat with scalafmt 3.9.5 17 | 19da40630c2645140336554bbce4a48881367bd2 18 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/config.yml: -------------------------------------------------------------------------------- 1 | blank_issues_enabled: false 2 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/generic.yml: -------------------------------------------------------------------------------- 1 | name: 🐞 Bug 2 | description: File a bug/issue 3 | title: "[BUG] " 4 | labels: [Bug, Needs Triage] 5 | body: 6 | - type: checkboxes 7 | attributes: 8 | label: Am I using the newest version of the library? 9 | description: Please always use the latest version before posting any issues. Your bug might already have been solved.. 10 | options: 11 | - label: I have made sure that I'm using the latest version of the library. 12 | required: true 13 | - type: checkboxes 14 | attributes: 15 | label: Is there an existing issue for this? 
16 | description: Please search to see if an issue already exists for the bug you encountered. 17 | options: 18 | - label: I have searched the existing issues 19 | required: true 20 | - type: textarea 21 | attributes: 22 | label: Current Behavior 23 | description: A concise description of what you're experiencing. 24 | validations: 25 | required: false 26 | - type: textarea 27 | attributes: 28 | label: Expected Behavior 29 | description: A concise description of what you expected to happen. 30 | validations: 31 | required: false 32 | - type: textarea 33 | attributes: 34 | label: Steps To Reproduce 35 | description: Steps to reproduce the behavior. 36 | placeholder: | 37 | Steps to Reproduce (for bugs) 38 | Provide a link to a live example, or an unambiguous set of steps to reproduce this bug. Include code to reproduce, if relevant. Example: 39 | Download the example file uploaded here 40 | Start Spark from command line as spark-shell --packages dev.mauch:spark-excel_2.12:x.y.z --foo=bar 41 | Read the downloaded example file 42 | val df = spark.read 43 | .format("dev.mauch.spark.excel") 44 | .option("dataAddress", "'My Sheet'!B3:C35") 45 | .load("example_file_exhibiting_bug.xlsx") 46 | validations: 47 | required: false 48 | - type: textarea 49 | attributes: 50 | label: Environment 51 | description: | 52 | examples: 53 | Include as many relevant details about the environment you experienced the bug in 54 | Spark version and language (Scala, Java, Python, R, ...): 55 | Spark-Excel version: 56 | Operating System and versioncluster environment, ...: 57 | value: | 58 | - Spark version: 59 | - Spark-Excel version: 60 | - OS: 61 | - Cluster environment 62 | render: markdown 63 | validations: 64 | required: false 65 | - type: textarea 66 | attributes: 67 | label: Anything else? 68 | description: | 69 | Links? References? Anything that will give us more context about the issue you are encountering! 70 | 71 | Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in. 72 | validations: 73 | required: false 74 | -------------------------------------------------------------------------------- /.github/dependabot.yml: -------------------------------------------------------------------------------- 1 | # To get started with Dependabot version updates, you'll need to specify which 2 | # package ecosystems to update and where the package manifests are located. 
3 | # Please see the documentation for all configuration options: 4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates 5 | 6 | version: 2 7 | updates: 8 | - package-ecosystem: "github-actions" # See documentation for possible values 9 | directory: "/" # Location of package manifests 10 | schedule: 11 | interval: "weekly" 12 | -------------------------------------------------------------------------------- /.github/workflows/changelog.yaml: -------------------------------------------------------------------------------- 1 | name: Changelog 2 | 3 | on: 4 | push: 5 | tags: 6 | - v[0-9]+.[0-9]+.[0-9]+ 7 | 8 | jobs: 9 | deploy: 10 | runs-on: ubuntu-latest 11 | 12 | steps: 13 | - name: Checkout Code 14 | uses: actions/checkout@v4 15 | 16 | - name: Update CHANGELOG 17 | id: changelog 18 | uses: Requarks/changelog-action@v1 19 | with: 20 | token: ${{ github.token }} 21 | tag: ${{ github.ref_name }} 22 | 23 | - name: Commit CHANGELOG.md 24 | uses: stefanzweifel/git-auto-commit-action@v5 25 | with: 26 | branch: main 27 | commit_message: 'docs: update CHANGELOG.md for ${{ github.ref_name }} [skip ci]' 28 | file_pattern: CHANGELOG.md 29 | -------------------------------------------------------------------------------- /.github/workflows/ci.yml: -------------------------------------------------------------------------------- 1 | name: Continuous Integration 2 | 3 | on: 4 | pull_request: 5 | branches: ['**', '!update/**', '!pr/**'] 6 | push: 7 | branches: ['**', '!update/**', '!pr/**'] 8 | tags: [v*] 9 | 10 | env: 11 | PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }} 12 | SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }} 13 | SONATYPE_CREDENTIAL_HOST: ${{ secrets.SONATYPE_CREDENTIAL_HOST }} 14 | SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }} 15 | PGP_SECRET: ${{ secrets.PGP_SECRET }} 16 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 17 | 18 | jobs: 19 | prepare: 20 | runs-on: ubuntu-latest 21 | outputs: 22 | matrix: ${{ steps.set-matrix.outputs.matrix }} 23 | steps: 24 | - name: Checkout 25 | uses: actions/checkout@v4 26 | 27 | - name: Generate matrix 28 | id: set-matrix 29 | run: | 30 | echo -n "matrix=" >> $GITHUB_OUTPUT 31 | ./mill resolve "spark-excel[_,_]" | \ 32 | jq -Rsc 'split("\n") | map(capture("spark-excel\\[(?<scala>[^,]+),(?<spark>[^\\]]+)\\]") | select(.)) | {include: .}' >> $GITHUB_OUTPUT 33 | 34 | build: 35 | needs: prepare 36 | name: Build and Test 37 | strategy: 38 | fail-fast: false 39 | matrix: ${{ fromJson(needs.prepare.outputs.matrix) }} 40 | runs-on: ubuntu-latest 41 | steps: 42 | - name: Checkout current branch (full) 43 | uses: actions/checkout@v4 44 | with: 45 | fetch-depth: 0 46 | 47 | - name: Download Java (temurin@11) 48 | id: download-java-temurin-11 49 | uses: typelevel/download-java@v2 50 | with: 51 | distribution: temurin 52 | java-version: 11 53 | 54 | - name: Setup Java (temurin@11) 55 | uses: actions/setup-java@v4 56 | with: 57 | distribution: jdkfile 58 | java-version: 11 59 | jdkFile: ${{ steps.download-java-temurin-11.outputs.jdkFile }} 60 | 61 | - name: Cache mill 62 | uses: actions/cache@v4 63 | with: 64 | path: | 65 | ~/.mill 66 | ~/.ivy2/cache 67 | ~/.coursier/cache/v1 68 | ~/.cache/coursier/v1 69 | ~/AppData/Local/Coursier/Cache/v1 70 | ~/Library/Caches/Coursier/v1 71 | key: ${{ runner.os }}-mill-cache-v2-${{ hashFiles('**/*.mill') }}-${{ hashFiles('project/build.properties') }} 72 | 73 | - name: Test 74 | run: ./mill spark-excel[${{ matrix.scala }},${{ matrix.spark }}].test 75 | 76 | - name: Publish 
Test Report 77 | uses: mikepenz/action-junit-report@v5 78 | if: always() # always run even if the previous step fails 79 | with: 80 | fail_on_failure: false 81 | include_passed: false 82 | detailed_summary: true 83 | annotate_only: true 84 | require_tests: false 85 | report_paths: 'out/**/test-report.xml' 86 | 87 | publish: 88 | name: Publish Artifacts 89 | needs: [build] 90 | if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main') 91 | strategy: 92 | matrix: 93 | os: [ubuntu-latest] 94 | scala: [2.12.20] 95 | java: [temurin@11] 96 | runs-on: ${{ matrix.os }} 97 | steps: 98 | - name: Checkout current branch (full) 99 | uses: actions/checkout@v4 100 | with: 101 | fetch-depth: 0 102 | 103 | - name: Download Java (temurin@11) 104 | id: download-java-temurin-11 105 | if: matrix.java == 'temurin@11' 106 | uses: typelevel/download-java@v2 107 | with: 108 | distribution: temurin 109 | java-version: 11 110 | 111 | - name: Setup Java (temurin@11) 112 | if: matrix.java == 'temurin@11' 113 | uses: actions/setup-java@v4 114 | with: 115 | distribution: jdkfile 116 | java-version: 11 117 | jdkFile: ${{ steps.download-java-temurin-11.outputs.jdkFile }} 118 | 119 | - name: Cache mill 120 | uses: actions/cache@v4 121 | with: 122 | path: | 123 | ~/.mill 124 | ~/.ivy2/cache 125 | ~/.coursier/cache/v1 126 | ~/.cache/coursier/v1 127 | ~/AppData/Local/Coursier/Cache/v1 128 | ~/Library/Caches/Coursier/v1 129 | key: ${{ runner.os }}-mill-cache-v2-${{ hashFiles('**/*.mill') }}-${{ hashFiles('project/build.properties') }} 130 | 131 | - name: Import GPG Key 132 | uses: crazy-max/ghaction-import-gpg@v6 133 | with: 134 | gpg_private_key: ${{ secrets.PGP_SECRET }} 135 | passphrase: ${{ secrets.PGP_PASSPHRASE }} 136 | trust_level: 5 137 | 138 | - name: Publish 139 | run: | 140 | export GPG_TTY=$(tty) 141 | ./mill -i mill.scalalib.SonatypeCentralPublishModule/ \ 142 | --username $SONATYPE_USERNAME \ 143 | --password $SONATYPE_PASSWORD \ 144 | --gpgArgs "--passphrase=$PGP_PASSPHRASE,--no-tty,--pinentry-mode,loopback,--batch,--yes,-a,-b" \ 145 | --bundleName dev.mauch-spark-excel-$(date +%Y-%m-%d-%H-%M) 146 | -------------------------------------------------------------------------------- /.github/workflows/clean.yml: -------------------------------------------------------------------------------- 1 | name: Clean 2 | 3 | on: push 4 | 5 | jobs: 6 | delete-artifacts: 7 | name: Delete Artifacts 8 | runs-on: ubuntu-latest 9 | env: 10 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 11 | steps: 12 | - name: Delete artifacts 13 | run: | 14 | # Customize those three lines with your repository and credentials: 15 | REPO=${GITHUB_API_URL}/repos/${{ github.repository }} 16 | 17 | # A shortcut to call GitHub API. 18 | ghapi() { curl --silent --location --user _:$GITHUB_TOKEN "$@"; } 19 | 20 | # A temporary file which receives HTTP response headers. 21 | TMPFILE=/tmp/tmp.$$ 22 | 23 | # An associative array, key: artifact name, value: number of artifacts of that name. 24 | declare -A ARTCOUNT 25 | 26 | # Process all artifacts on this repository, loop on returned "pages". 27 | URL=$REPO/actions/artifacts 28 | while [[ -n "$URL" ]]; do 29 | 30 | # Get current page, get response headers in a temporary file. 31 | JSON=$(ghapi --dump-header $TMPFILE "$URL") 32 | 33 | # Get URL of next page. Will be empty if we are at the last page. 
34 | URL=$(grep '^Link:' "$TMPFILE" | tr ',' '\n' | grep 'rel="next"' | head -1 | sed -e 's/.*<//' -e 's/>.*//') 35 | rm -f $TMPFILE 36 | 37 | # Number of artifacts on this page: 38 | COUNT=$(( $(jq <<<$JSON -r '.artifacts | length') )) 39 | 40 | # Loop on all artifacts on this page. 41 | for ((i=0; $i < $COUNT; i++)); do 42 | 43 | # Get name of artifact and count instances of this name. 44 | name=$(jq <<<$JSON -r ".artifacts[$i].name?") 45 | ARTCOUNT[$name]=$(( $(( ${ARTCOUNT[$name]} )) + 1)) 46 | 47 | id=$(jq <<<$JSON -r ".artifacts[$i].id?") 48 | size=$(( $(jq <<<$JSON -r ".artifacts[$i].size_in_bytes?") )) 49 | printf "Deleting '%s' #%d, %'d bytes\n" $name ${ARTCOUNT[$name]} $size 50 | ghapi -X DELETE $REPO/actions/artifacts/$id 51 | done 52 | done 53 | -------------------------------------------------------------------------------- /.github/workflows/dependency-graph.yml: -------------------------------------------------------------------------------- 1 | name: github-dependency-graph 2 | 3 | on: 4 | push: 5 | branches: 6 | - main 7 | 8 | jobs: 9 | submit-dependency-graph: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - uses: coursier/cache-action@v6 14 | - uses: actions/setup-java@v4 15 | with: 16 | distribution: 'temurin' 17 | java-version: '17' 18 | - uses: ckipp01/mill-dependency-submission@v1 19 | -------------------------------------------------------------------------------- /.github/workflows/potential-duplicates.yml: -------------------------------------------------------------------------------- 1 | name: Potential Duplicates 2 | on: 3 | issues: 4 | types: [opened, edited] 5 | jobs: 6 | run: 7 | runs-on: ubuntu-latest 8 | steps: 9 | - uses: wow-actions/potential-duplicates@v1 10 | with: 11 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 12 | # Issue title filter work with anymatch https://www.npmjs.com/package/anymatch. 13 | # Any matched issue will stop detection immediately. 14 | # You can specify multi filters in each line. 15 | filter: '' 16 | # Exclude keywords in title before detecting. 17 | exclude: '' 18 | # Label to set, when potential duplicates are detected. 19 | label: potential-duplicate 20 | # Get issues with state to compare. Supported state: 'all', 'closed', 'open'. 21 | state: all 22 | # If similarity is higher than this threshold([0,1]), issue will be marked as duplicate. 23 | threshold: 0.6 24 | # Reactions to be add to comment when potential duplicates are detected. 25 | # Available reactions: "-1", "+1", "confused", "laugh", "heart", "hooray", "rocket", "eyes" 26 | reactions: 'eyes, confused' 27 | # Comment to post when potential duplicates are detected. 28 | comment: > 29 | Please check these potential duplicates: {{#issues}} 30 | - [#{{ number }}] {{ title }} ({{ accuracy }}%) 31 | {{/issues}} 32 | 33 | If this issue is a duplicate, please add any additional info to the ticket with the most information and close this one. 
34 | -------------------------------------------------------------------------------- /.github/workflows/pr-agent.yaml: -------------------------------------------------------------------------------- 1 | on: 2 | pull_request: 3 | issue_comment: 4 | jobs: 5 | pr_agent_job: 6 | runs-on: ubuntu-latest 7 | permissions: 8 | issues: write 9 | pull-requests: write 10 | contents: write 11 | name: Run pr agent on every pull request, respond to user comments 12 | steps: 13 | - name: PR Agent action step 14 | id: pragent 15 | uses: Codium-ai/pr-agent@main 16 | env: 17 | OPENAI_KEY: ${{ secrets.OPENAI_KEY }} 18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 19 | -------------------------------------------------------------------------------- /.github/workflows/rebase.yml: -------------------------------------------------------------------------------- 1 | name: Automatic Rebase 2 | on: 3 | issue_comment: 4 | types: [created] 5 | jobs: 6 | rebase: 7 | name: Rebase 8 | if: github.event.issue.pull_request != '' && contains(github.event.comment.body, '/rebase') 9 | runs-on: ubuntu-latest 10 | steps: 11 | - name: Checkout the latest code 12 | uses: actions/checkout@v4 13 | with: 14 | token: ${{ secrets.GITHUB_TOKEN }} 15 | fetch-depth: 0 # otherwise, you will fail to push refs to dest repo 16 | - name: Automatic Rebase 17 | uses: cirrus-actions/rebase@1.8 18 | env: 19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 20 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | project/target/ 3 | project/project/ 4 | out/ 5 | *.p12 6 | .ensime* 7 | *.swp 8 | .idea 9 | *.log 10 | 11 | .metals/ 12 | project/metals.sbt 13 | **/.bsp/ 14 | **/.bloop/ 15 | .vscode 16 | private-key.pem 17 | .secrets 18 | .~lock.*.xlsx# 19 | -------------------------------------------------------------------------------- /.mill-jvm-opts: -------------------------------------------------------------------------------- 1 | -Xmx4G 2 | -------------------------------------------------------------------------------- /.mill-version: -------------------------------------------------------------------------------- 1 | 0.12.14 2 | -------------------------------------------------------------------------------- /.scala-steward.conf: -------------------------------------------------------------------------------- 1 | updatePullRequests = "always" 2 | commits.message = "chore: Update ${artifactName} from ${currentVersion} to ${nextVersion}" 3 | -------------------------------------------------------------------------------- /.scalafmt.conf: -------------------------------------------------------------------------------- 1 | version = 3.9.7 2 | style = default 3 | runner.dialect=scala212 4 | maxColumn = 120 5 | continuationIndent.defnSite = 2 6 | continuationIndent.callSite = 2 7 | align.preset = "none" 8 | danglingParentheses.preset = true 9 | optIn.configStyleArguments = false 10 | docstrings.style = SpaceAsterisk 11 | spaces.beforeContextBoundColon = true 12 | rewrite.rules = [SortImports] 13 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing 2 | 3 | When contributing to this repository, please first discuss the change you wish to make via an issue 4 | with the owners of this repository before making a change. 5 | 6 | ## Pull Request Process 7 | 8 | 1. 
Unless the changes are trivial extensions or bugfixes, 9 | please create an issue proposing what you want to change first. 10 | 2. After coordination with the project maintainers, 11 | go ahead and create the PR. 12 | 3. If you want to do larger refactorings that are not obviously necessary for the PR 13 | please coordinate with the project maintainers first. 14 | We're open to refactorings but would like to discuss and review them independently. 15 | 4. Auto-format your code using `mill mill.scalalib.scalafmt.ScalafmtModule/reformatAll __.sources`. 16 | 5. Run all tests locally using `mill spark-excel[__].test`. 17 | 6. Update the `README.md` and `CHANGELOG.md` with details of changes to the interface. 18 | 7. Rebase your changes to the latest master in case something changed there. 19 | -------------------------------------------------------------------------------- /ISSUE_TEMPLATE.md: -------------------------------------------------------------------------------- 1 | Your issue may already be reported! 2 | Please search on the [issue track](../) before creating one. 3 | Moreover, please read the [`CHANGELOG.md`](../../blob/master/CHANGELOG.md) file for any changes you might have missed. 4 | 5 | ## Expected Behavior 6 | > If you're describing a bug, tell us what should happen 7 | > If you're suggesting a change/improvement, tell us how it should work 8 | 9 | ## Current Behavior 10 | > If describing a bug, tell us what happens instead of the expected behavior 11 | > If suggesting a change/improvement, explain the difference from current behavior. 12 | > If you have a stack trace or any helpful information from the console, paste it in its entirety. 13 | > If the problem happens with a certain file, upload it somewhere and paste a link. 14 | 15 | ## Possible Solution 16 | > Not obligatory, but suggest a fix/reason for the bug, 17 | > or ideas how to implement the addition or change 18 | 19 | ## Steps to Reproduce (for bugs) 20 | > Provide a link to a live example, or an unambiguous set of steps to 21 | > reproduce this bug. Include code to reproduce, if relevant. 22 | > Example: 23 | 1. Download the example file uploaded [here](http://example.com/) 24 | 2. Start Spark from command line as `spark-shell --packages dev.mauch:spark-excel_2.12:x.y.z --foo=bar` 25 | 3. Read the downloaded example file 26 | ``` 27 | val df = spark.read 28 | .format("dev.mauch.spark.excel") 29 | .option("dataAddress", "'My Sheet'!B3:C35") 30 | .load("example_file_exhibiting_bug.xlsx") 31 | ``` 32 | 33 | ## Context 34 | > How has this issue affected you? What are you trying to accomplish? 
35 | > Providing context helps us come up with a solution that is most useful in the real world 36 | 37 | ## Your Environment 38 | > Include as many relevant details about the environment you experienced the bug in 39 | * Spark version and language (Scala, Java, Python, R, ...): 40 | * Spark-Excel version: 41 | * Operating System and version, cluster environment, ...: 42 | -------------------------------------------------------------------------------- /build.mill: -------------------------------------------------------------------------------- 1 | import coursier.maven.MavenRepository 2 | import mill._, scalalib._, publish._ 3 | import Assembly._ 4 | import $ivy.`de.tototec::de.tobiasroeser.mill.vcs.version::0.4.0` 5 | import de.tobiasroeser.mill.vcs.version.VcsVersion 6 | 7 | trait SparkModule extends Cross.Module2[String, String] with SbtModule with SonatypeCentralPublishModule { 8 | outer => 9 | override def scalaVersion = crossValue 10 | val sparkVersion = crossValue2 11 | val Array(sparkMajor, sparkMinor, sparkPatch) = sparkVersion.split("\\.") 12 | val sparkBinaryVersion = s"$sparkMajor.$sparkMinor" 13 | 14 | override def millSourcePath = super.millSourcePath / os.up 15 | 16 | object LowerOrEqual { 17 | def unapply(otherVersion: String): Boolean = otherVersion match { 18 | case s"${sparkMaj}.${sparkMin}.${sparkPat}" => 19 | sparkMaj == sparkMajor && (sparkMin < sparkMinor || (sparkMin == sparkMinor && sparkPat <= sparkPatch)) 20 | case s"${sparkMaj}.${sparkMin}" => sparkMaj == sparkMajor && sparkMin <= sparkMinor 21 | case sparkMaj => sparkMaj == sparkMajor 22 | } 23 | } 24 | object HigherOrEqual { 25 | def unapply(otherVersion: String): Boolean = otherVersion match { 26 | case s"${sparkMaj}.${sparkMin}.${sparkPat}" => 27 | sparkMaj == sparkMajor && (sparkMin > sparkMinor || (sparkMin == sparkMinor && sparkPat >= sparkPatch)) 28 | case s"${sparkMaj}.${sparkMin}" => sparkMaj == sparkMajor && sparkMin >= sparkMinor 29 | case sparkMaj => sparkMaj == sparkMajor 30 | } 31 | } 32 | 33 | def sparkVersionSpecificSources = T { 34 | val versionSpecificDirs = os.list(mill.api.WorkspaceRoot.workspaceRoot / "src" / "main") 35 | val Array(sparkMajor, sparkMinor, sparkPatch) = sparkVersion.split("\\.") 36 | val sparkBinaryVersion = s"$sparkMajor.$sparkMinor" 37 | versionSpecificDirs.filter(_.last match { 38 | case "scala" => true 39 | case `sparkBinaryVersion` => true 40 | case s"${LowerOrEqual()}_and_up" => true 41 | case s"${LowerOrEqual()}_to_${HigherOrEqual()}" => true 42 | case _ => false 43 | }) 44 | } 45 | override def sources = T.sources { 46 | super.sources() ++ sparkVersionSpecificSources().map(PathRef(_)) 47 | } 48 | 49 | override def docSources = T.sources(Seq[PathRef]()) 50 | 51 | override def artifactName = "spark-excel" 52 | 53 | override def publishVersion: T[String] = T { 54 | val vcsVersion = VcsVersion.vcsState().format(untaggedSuffix = "-SNAPSHOT") 55 | s"${sparkVersion}_${vcsVersion}" 56 | } 57 | def pomSettings = PomSettings( 58 | description = "A Spark plugin for reading and writing Excel files", 59 | organization = "dev.mauch", 60 | url = "https://github.com/nightscape/spark-excel", 61 | licenses = Seq(License.`Apache-2.0`), 62 | versionControl = VersionControl.github("nightscape", "spark-excel"), 63 | developers = Seq(Developer("nightscape", "Martin Mauch", "https://github.com/nightscape")) 64 | ) 65 | 66 | def assemblyRules = Seq( 67 | Rule.AppendPattern(".*\\.conf"), // all *.conf files will be concatenated into single file 68 | Rule.Relocate("org.apache.commons.io.**", 
"shadeio.commons.io.@1"), 69 | Rule.Relocate("org.apache.commons.compress.**", "shadeio.commons.compress.@1") 70 | ) 71 | 72 | override def extraPublish = Seq( 73 | PublishInfo(assembly(), classifier = None, ivyConfig = "compile"), 74 | PublishInfo(jar(), classifier = Some("thin"), ivyConfig = "compile") 75 | ) 76 | 77 | override def sonatypeCentralReadTimeout: T[Int] = 600000 78 | override def sonatypeCentralAwaitTimeout: T[Int] = 1200 * 1000 79 | 80 | val sparkDeps = Agg( 81 | ivy"org.apache.spark::spark-core:$sparkVersion", 82 | ivy"org.apache.spark::spark-sql:$sparkVersion", 83 | ivy"org.apache.spark::spark-hive:$sparkVersion" 84 | ) 85 | 86 | override def compileIvyDeps = if (sparkVersion < "3.3.0") { 87 | sparkDeps ++ Agg(ivy"org.slf4j:slf4j-api:1.7.36".excludeOrg("stax")) 88 | } else { 89 | sparkDeps 90 | } 91 | 92 | val poiVersion = "5.4.1" 93 | 94 | override def ivyDeps = { 95 | val base = Agg( 96 | ivy"org.apache.poi:poi:$poiVersion", 97 | ivy"org.apache.poi:poi-ooxml:$poiVersion", 98 | ivy"org.apache.poi:poi-ooxml-lite:$poiVersion", 99 | ivy"org.apache.xmlbeans:xmlbeans:5.3.0", 100 | ivy"com.norbitltd::spoiwo:2.2.1", 101 | ivy"com.github.pjfanning:excel-streaming-reader:5.1.0", 102 | ivy"commons-io:commons-io:2.19.0", 103 | ivy"org.apache.commons:commons-compress:1.27.1", 104 | ivy"org.apache.logging.log4j:log4j-api:2.24.3", 105 | ivy"com.zaxxer:SparseBitSet:1.3", 106 | ivy"org.apache.commons:commons-collections4:4.5.0", 107 | ivy"com.github.virtuald:curvesapi:1.08", 108 | ivy"commons-codec:commons-codec:1.18.0", 109 | ivy"org.apache.commons:commons-math3:3.6.1", 110 | ivy"org.scala-lang.modules::scala-collection-compat:2.13.0" 111 | ) 112 | if (sparkVersion >= "3.3.0") { 113 | base ++ Agg(ivy"org.apache.logging.log4j:log4j-core:2.24.3") 114 | } else { 115 | base 116 | } 117 | } 118 | 119 | object test extends SbtTests with TestModule.ScalaTest { 120 | 121 | override def millSourcePath = super.millSourcePath 122 | 123 | override def sources = T.sources { 124 | Seq(PathRef(millSourcePath / "src" / "test" / "scala")) 125 | } 126 | 127 | override def resources = T.sources { 128 | Seq(PathRef(millSourcePath / "src" / "test" / "resources")) 129 | } 130 | 131 | def scalaVersion = outer.scalaVersion() 132 | 133 | def repositoriesTask = T.task { 134 | super.repositoriesTask() ++ Seq(MavenRepository("https://jitpack.io")) 135 | } 136 | 137 | def ivyDeps = sparkDeps ++ Agg( 138 | ivy"org.typelevel::cats-core:2.13.0", 139 | ivy"org.scalatest::scalatest:3.2.19", 140 | ivy"org.scalatestplus::scalacheck-1-16:3.2.14.0", 141 | ivy"org.scalacheck::scalacheck:1.18.1", 142 | ivy"com.github.alexarchambault::scalacheck-shapeless_1.15:1.3.0", 143 | ivy"com.github.mrpowers::spark-fast-tests:1.3.0", 144 | ivy"org.scalamock::scalamock:5.2.0" 145 | ) 146 | } 147 | 148 | } 149 | 150 | val scala213 = "2.13.16" 151 | val scala212 = "2.12.20" 152 | val spark24 = List("2.4.8") 153 | val spark30 = List("3.0.3") 154 | val spark31 = List("3.1.3") 155 | val spark32 = List("3.2.4") 156 | val spark33 = List("3.3.4") 157 | val spark34 = List("3.4.4", "3.4.1") 158 | val spark35 = List("3.5.6") 159 | val sparkVersions = spark24 ++ spark30 ++ spark31 ++ spark32 ++ spark33 ++ spark34 ++ spark35 160 | val crossMatrix = 161 | sparkVersions.map(spark => (scala212, spark)) ++ 162 | sparkVersions.filter(_ >= "3.2").map(spark => (scala213, spark)) 163 | 164 | object `spark-excel` extends Cross[SparkModule](crossMatrix) {} 165 | -------------------------------------------------------------------------------- /docs/README.md: 
-------------------------------------------------------------------------------- 1 | # Documentation 2 | 3 | We need help here! Please send us a PR with any examples or documentation you 4 | care to write that may be of help to others. 5 | 6 | ## Example Notebook 7 | 8 | [spark_excel_examples.ipynb](spark_excel_examples.ipynb) contains examples in 9 | a notebook format. 10 | 11 | ## Azure Synapse 12 | 13 | [azure_synapse.md](azure_synapse.md) has some instructions for loading spark-excel into a 14 | Spark pool for Azure Synapse. -------------------------------------------------------------------------------- /docs/azure_synapse.md: -------------------------------------------------------------------------------- 1 | # Azure Synapse 2 | 3 | Adding the spark-excel library to the Spark workspace will enable reading 4 | and writing of Excel files to an Azure Storage Account. 5 | 6 | At the time of writing, the following libraries have to be added to the 7 | workspace and then configured for each Spark Pool. 8 | 9 | Each library can be downloaded from [Maven Central](https://search.maven.org) 10 | (thanks Sonatype!). 11 | 12 | * spark-excel_2.12-3.1.2_0.16.5-pre2.jar 13 | * log4j-core-2.17.2.jar 14 | * log4j-api-2.17.2.jar 15 | * xmlbeans-5.0.3.jar 16 | * poi-ooxml-lite-5.2.2.jar 17 | * commons-collections4-4.4.jar 18 | 19 | Once those have been applied, the Excel files can be read into a dataframe like so: 20 | 21 | ``` 22 | excel_path = "abfss://<container>@<storage account>.dfs.core.windows.net/<path to excel>" 23 | df = (spark.read 24 | .format("excel") 25 | .load(excel_path) 26 | ) 27 | display(df) 28 | ``` 29 | -------------------------------------------------------------------------------- /private-key.pem.enc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/private-key.pem.enc -------------------------------------------------------------------------------- /src/README.md: -------------------------------------------------------------------------------- 1 | Spark-excel Source Code Structure 2 | ================================= 3 | 4 | Under the hood, spark-excel has two implementations. Starting with spark-excel 0.14.0, we added spark-excel V2, which uses the Spark Data Source API V2. 5 | 6 | These two implementations are compatible with each other in terms of options and behavior. However, spark-excel V2 offers features that are not available in the original spark-excel implementation, for example loading multiple Excel files and handling corrupted records. 7 | 8 | The Spark Data Source API V2 has been evolving since Spark 2.3. To keep the spark-excel V2 code to a minimum, spark-excel V2 relies heavily on the utilities and improvements of each upstream Spark version. 9 | 10 | Spark-excel V2 therefore uses Spark-version-specific source folders, for example: 11 | `2.4/.../spark/v2/excel` for the Spark 2.4 Data Source API V2 12 | `3.x/.../spark/v2/excel` for all Spark 3.* Data Source API V2 13 | `3.1_3.2/.../spark/v2/excel` for shared code between Spark 3.1 and Spark 3.2 Data Source API V2 14 | 15 | These structures are also configured in [build.sc](https://github.dev/mauch/spark-excel/blob/main/build.sc#L13), so the project can be compiled for each Spark version.
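To make the split between the two implementations concrete, here is a minimal usage sketch contrasting the original data source with the V2 one (registered under the short name `excel`, see `ExcelDataSource.shortName()` further down). It assumes an active `SparkSession` named `spark`; the file path and the `header` option value are placeholder examples, not fixtures from this repository.

```
// Minimal sketch (assumes an active SparkSession named `spark`).
// Original (V1) implementation, registered as "dev.mauch.spark.excel":
val dfV1 = spark.read
  .format("dev.mauch.spark.excel")
  .option("dataAddress", "'My Sheet'!B3:C35")
  .load("/path/to/example.xlsx")

// V2 implementation, registered as "excel" (supports e.g. loading multiple files at once):
val dfV2 = spark.read
  .format("excel")
  .option("header", "true")
  .load("/path/to/example.xlsx")
```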
16 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/ExcelSparkInternal.scala: -------------------------------------------------------------------------------- 1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 4 | * the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | * specific language governing permissions and limitations under the License. 11 | */ 12 | package org.apache.spark.nightscape 13 | 14 | import java.nio.file.{Files, Paths} 15 | import org.apache.spark.rdd.InputFileBlockHolder 16 | 17 | /** To provide input-file-name value. The sole purpose of this is for proxying into spark internal implementation of 18 | * InputFileBlockHolder 19 | */ 20 | object ExcelSparkInternal { 21 | def setInputFileName(path: String): Unit = { 22 | InputFileBlockHolder.set(path, 0, Files.size(Paths.get(path))) 23 | } 24 | } 25 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/excel/v2/ExcelDateTimeStringUtils.scala: -------------------------------------------------------------------------------- 1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 4 | * the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | * specific language governing permissions and limitations under the License. 
11 | */ 12 | package dev.mauch.spark.excel.v2 13 | 14 | import org.apache.spark.unsafe.types.UTF8String 15 | import org.apache.spark.sql.catalyst.util._ 16 | import java.time.ZoneId 17 | import org.apache.spark.sql.catalyst.util.TimestampFormatter 18 | 19 | /** Wrapping the API change between spark 3.0 vs 3.1 */ 20 | object ExcelDateTimeStringUtils { 21 | def stringToTimestamp(v: String, zoneId: ZoneId): Option[Long] = { 22 | val str = UTF8String.fromString(v) 23 | DateTimeUtils.stringToTimestamp(str, java.util.TimeZone.getTimeZone(zoneId)) 24 | } 25 | 26 | def stringToDate(v: String, zoneId: ZoneId): Option[Int] = { 27 | val str = UTF8String.fromString(v) 28 | DateTimeUtils.stringToDate(str) 29 | } 30 | 31 | def getTimestampFormatter(options: ExcelOptions): TimestampFormatter = 32 | TimestampFormatter(options.timestampFormat, java.util.TimeZone.getTimeZone(options.zoneId), options.locale) 33 | 34 | def getDateFormatter(options: ExcelOptions): DateFormatter = 35 | DateFormatter(options.dateFormat, options.locale) 36 | } 37 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/excel/v2/ExcelFilters.scala: -------------------------------------------------------------------------------- 1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 4 | * the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | * specific language governing permissions and limitations under the License. 11 | */ 12 | package dev.mauch.spark.excel.v2 13 | 14 | import org.apache.spark.sql.sources 15 | import org.apache.spark.sql.types.StructType 16 | import org.apache.spark.sql.catalyst.InternalRow 17 | 18 | /** Wrapping the API change between spark 3.0 vs 3.1 */ 19 | class ExcelFilters(filters: Seq[sources.Filter], requiredSchema: StructType) { 20 | def skipRow(row: InternalRow, index: Int): Boolean = { false } 21 | } 22 | 23 | object ExcelFilters { 24 | def pushedFilters(filters: Array[sources.Filter], schema: StructType): Array[sources.Filter] = 25 | filters 26 | } 27 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/excel/v2/ExcelOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 20 | import org.apache.spark.sql.internal.SQLConf 21 | 22 | class ExcelOptions( 23 | @transient 24 | val parameters: CaseInsensitiveMap[String], 25 | val defaultTimeZoneId: String, 26 | val defaultColumnNameOfCorruptRecord: String 27 | ) extends ExcelOptionsTrait 28 | with Serializable { 29 | // all parameter handling is implemented in ExcelOptionsTrait 30 | 31 | def this(parameters: Map[String, String], defaultTimeZoneId: String) = { 32 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, SQLConf.get.columnNameOfCorruptRecord) 33 | } 34 | 35 | def this(parameters: Map[String, String], defaultTimeZoneId: String, defaultColumnNameOfCorruptRecord: String) = { 36 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, defaultColumnNameOfCorruptRecord) 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/excel/v2/ExcelParserBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import _root_.org.apache.spark.sql.catalyst.util.BadRecordException 20 | import org.apache.spark.unsafe.types.UTF8String 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | 23 | trait ExcelParserBase { 24 | 25 | protected def getCurrentInput: UTF8String 26 | def badRecord(partialResults: Array[InternalRow], baseException: Throwable): BadRecordException = 27 | BadRecordException(() => getCurrentInput, () => partialResults.headOption, baseException) 28 | } 29 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/excel/v2/FailureSafeParser.scala: -------------------------------------------------------------------------------- 1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 4 | * the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | * specific language governing permissions and limitations under the License. 
11 | */ 12 | package dev.mauch.spark.excel.v2 13 | 14 | import org.apache.spark.SparkException 15 | import org.apache.spark.sql.catalyst.InternalRow 16 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow 17 | import org.apache.spark.sql.types.StructType 18 | import org.apache.spark.unsafe.types.UTF8String 19 | import org.apache.spark.sql.catalyst.util._ 20 | 21 | class FailureSafeParser[IN]( 22 | rawParser: IN => Iterable[InternalRow], 23 | mode: ParseMode, 24 | schema: StructType, 25 | columnNameOfCorruptRecord: String 26 | ) { 27 | 28 | private val corruptFieldIndex = 29 | if (schema.fieldNames.contains(columnNameOfCorruptRecord)) { 30 | Some(schema.fieldIndex(columnNameOfCorruptRecord)) 31 | } else None 32 | 33 | private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord)) 34 | private val resultRow = new GenericInternalRow(schema.length) 35 | private val nullResult = new GenericInternalRow(schema.length) 36 | 37 | // This function takes 2 parameters: an optional partial result, and the bad record. If the given 38 | // schema doesn't contain a field for corrupted record, we just return the partial result or a 39 | // row with all fields null. If the given schema contains a field for corrupted record, we will 40 | // set the bad record to this field, and set other fields according to the partial result or null. 41 | private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = { 42 | if (corruptFieldIndex.isDefined) { (row, badRecord) => 43 | { 44 | var i = 0 45 | while (i < actualSchema.length) { 46 | val from = actualSchema(i) 47 | resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull 48 | i += 1 49 | } 50 | resultRow(corruptFieldIndex.get) = badRecord() 51 | resultRow 52 | } 53 | } else { (row, _) => row.getOrElse(nullResult) } 54 | } 55 | 56 | def parse(input: IN): Iterator[InternalRow] = { 57 | try { rawParser.apply(input).toIterator.map(row => toResultRow(Some(row), () => null)) } 58 | catch { 59 | case e: BadRecordException => 60 | mode match { 61 | case PermissiveMode => Iterator(toResultRow(e.partialResult(), e.record)) 62 | case DropMalformedMode => Iterator.empty 63 | case FailFastMode => 64 | throw new SparkException( 65 | "Malformed records are detected in record parsing. " + 66 | s"Parse Mode: ${FailFastMode.name}. To process malformed records as null " + 67 | "result, try setting the option 'mode' as 'PERMISSIVE'.", 68 | e 69 | ) 70 | } 71 | } 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/2.4/scala/dev/mauch/spark/excel/v2/SchemaUtils.scala: -------------------------------------------------------------------------------- 1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 4 | * the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | * specific language governing permissions and limitations under the License. 
11 | */ 12 | package dev.mauch.spark.excel.v2 13 | 14 | import org.apache.spark.sql.catalyst.analysis._ 15 | import org.apache.spark.sql.types.StructType 16 | 17 | /** Utils for handling schemas. (Copied from spark.util) 18 | */ 19 | object SchemaUtils { 20 | 21 | /** Checks if an input schema has duplicate column names. This throws an exception if the duplication exists. 22 | * 23 | * @param schema 24 | * schema to check 25 | * @param colType 26 | * column type name, used in an exception message 27 | * @param caseSensitiveAnalysis 28 | * whether duplication checks should be case sensitive or not 29 | */ 30 | def checkSchemaColumnNameDuplication( 31 | schema: StructType, 32 | colType: String, 33 | caseSensitiveAnalysis: Boolean = false 34 | ): Unit = { checkColumnNameDuplication(schema.map(_.name), colType, caseSensitiveAnalysis) } 35 | 36 | // Returns true if a given resolver is case-sensitive 37 | private def isCaseSensitiveAnalysis(resolver: Resolver): Boolean = { 38 | if (resolver == caseSensitiveResolution) { true } 39 | else if (resolver == caseInsensitiveResolution) { false } 40 | else { 41 | sys.error( 42 | "A resolver to check if two identifiers are equal must be " + 43 | "`caseSensitiveResolution` or `caseInsensitiveResolution` in o.a.s.sql.catalyst." 44 | ) 45 | } 46 | } 47 | 48 | /** Checks if input column names have duplicate identifiers. This throws an exception if the duplication exists. 49 | * 50 | * @param columnNames 51 | * column names to check 52 | * @param colType 53 | * column type name, used in an exception message 54 | * @param resolver 55 | * resolver used to determine if two identifiers are equal 56 | */ 57 | def checkColumnNameDuplication(columnNames: Seq[String], colType: String, resolver: Resolver): Unit = { 58 | checkColumnNameDuplication(columnNames, colType, isCaseSensitiveAnalysis(resolver)) 59 | } 60 | 61 | /** Checks if input column names have duplicate identifiers. This throws an exception if the duplication exists. 62 | * 63 | * @param columnNames 64 | * column names to check 65 | * @param colType 66 | * column type name, used in an exception message 67 | * @param caseSensitiveAnalysis 68 | * whether duplication checks should be case sensitive or not 69 | */ 70 | def checkColumnNameDuplication(columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = { 71 | val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase) 72 | if (names.distinct.length != names.length) { 73 | val duplicateColumns = names 74 | .groupBy(identity) 75 | .collect { case (x, ys) if ys.length > 1 => s"`$x`" } 76 | throw new RuntimeException(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}") 77 | } 78 | } 79 | } 80 | -------------------------------------------------------------------------------- /src/main/3.0/scala/dev/mauch/spark/excel/v2/ExcelDateTimeStringUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.unsafe.types.UTF8String 20 | import org.apache.spark.sql.catalyst.util._ 21 | import java.time.ZoneId 22 | import org.apache.spark.sql.catalyst.util.TimestampFormatter 23 | import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT 24 | 25 | /** Wrapping the API change between spark 3.0 vs 3.1 */ 26 | object ExcelDateTimeStringUtils { 27 | def stringToTimestamp(v: String, zoneId: ZoneId): Option[Long] = { 28 | val str = UTF8String.fromString(DateTimeUtils.cleanLegacyTimestampStr(v)) 29 | DateTimeUtils.stringToTimestamp(str, zoneId) 30 | } 31 | 32 | def stringToDate(v: String, zoneId: ZoneId): Option[Int] = { 33 | val str = UTF8String.fromString(DateTimeUtils.cleanLegacyTimestampStr(v)) 34 | DateTimeUtils.stringToDate(str, zoneId) 35 | } 36 | 37 | def getTimestampFormatter(options: ExcelOptions): TimestampFormatter = TimestampFormatter( 38 | options.timestampFormat, 39 | options.zoneId, 40 | options.locale, 41 | legacyFormat = FAST_DATE_FORMAT, 42 | isParsing = true 43 | ) 44 | 45 | def getDateFormatter(options: ExcelOptions): DateFormatter = 46 | DateFormatter(options.dateFormat, options.zoneId, options.locale, legacyFormat = FAST_DATE_FORMAT, isParsing = true) 47 | } 48 | -------------------------------------------------------------------------------- /src/main/3.0/scala/dev/mauch/spark/excel/v2/ExcelFilters.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.catalyst.csv.CSVFilters 20 | import org.apache.spark.sql.sources 21 | import org.apache.spark.sql.types.StructType 22 | 23 | /** Wrapping the API change between spark 3.0 vs 3.1 */ 24 | class ExcelFilters(filters: Seq[sources.Filter], requiredSchema: StructType) 25 | extends CSVFilters(filters, requiredSchema) {} 26 | 27 | object ExcelFilters { 28 | def pushedFilters(filters: Array[sources.Filter], schema: StructType): Array[sources.Filter] = 29 | CSVFilters.pushedFilters(filters, schema) 30 | } 31 | -------------------------------------------------------------------------------- /src/main/3.0_and_up/scala/dev/mauch/spark/excel/v2/ExcelDataSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.connector.catalog.Table 20 | import org.apache.spark.sql.execution.datasources.FileFormat 21 | import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2 22 | import org.apache.spark.sql.types.StructType 23 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 24 | 25 | /** Derived from Spark own CSV implementation 26 | */ 27 | class ExcelDataSource extends FileDataSourceV2 { 28 | 29 | override def fallbackFileFormat: Class[_ <: FileFormat] = classOf[ExcelFileFormat] 30 | 31 | override def getTable(options: CaseInsensitiveStringMap): Table = { 32 | val paths = getPaths(options) 33 | val tableName = getTableName(options, paths) 34 | val optionsWithoutPaths = getOptionsWithoutPaths(options) 35 | ExcelTable(tableName, sparkSession, optionsWithoutPaths, paths, None) 36 | } 37 | 38 | override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = { 39 | val paths = getPaths(options) 40 | val tableName = getTableName(options, paths) 41 | val optionsWithoutPaths = getOptionsWithoutPaths(options) 42 | ExcelTable(tableName, sparkSession, optionsWithoutPaths, paths, Some(schema)) 43 | } 44 | 45 | /** The string that represents the format that this data source provider uses 46 | */ 47 | override def shortName(): String = "excel" 48 | 49 | } 50 | -------------------------------------------------------------------------------- /src/main/3.0_and_up/scala/dev/mauch/spark/excel/v2/ExcelFileFormat.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.conf.Configuration 20 | import org.apache.hadoop.fs.{FileStatus, Path} 21 | import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext} 22 | import org.apache.spark.sql.SparkSession 23 | import org.apache.spark.sql.catalyst.InternalRow 24 | import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriter, OutputWriterFactory, PartitionedFile} 25 | import org.apache.spark.sql.sources.{DataSourceRegister, Filter} 26 | import org.apache.spark.sql.types._ 27 | 28 | /** derived from binary file data source. 
Needed to support writing excel using the V2 API 29 | */ 30 | class ExcelFileFormat extends FileFormat with DataSourceRegister { 31 | 32 | override def inferSchema( 33 | sparkSession: SparkSession, 34 | options: Map[String, String], 35 | files: Seq[FileStatus] 36 | ): Option[StructType] = { 37 | throw new UnsupportedOperationException("ExcelFileFormat as fallback format for V2 supports writing only") 38 | } 39 | 40 | override def prepareWrite( 41 | sparkSession: SparkSession, 42 | job: Job, 43 | options: Map[String, String], 44 | dataSchema: StructType 45 | ): OutputWriterFactory = { 46 | val excelOptions = new ExcelOptions(options, sparkSession.conf.get("spark.sql.session.timeZone")) 47 | 48 | new OutputWriterFactory { 49 | override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { 50 | new ExcelOutputWriter(path, dataSchema, context, excelOptions) 51 | } 52 | 53 | override def getFileExtension(context: TaskAttemptContext): String = 54 | s".${excelOptions.fileExtension}" 55 | } 56 | } 57 | 58 | override def isSplitable(sparkSession: SparkSession, options: Map[String, String], path: Path): Boolean = { 59 | false 60 | } 61 | 62 | override def shortName(): String = "excel" 63 | 64 | /* 65 | We need this class for writing only, thus reader is not implemented 66 | */ 67 | override protected def buildReader( 68 | sparkSession: SparkSession, 69 | dataSchema: StructType, 70 | partitionSchema: StructType, 71 | requiredSchema: StructType, 72 | filters: Seq[Filter], 73 | options: Map[String, String], 74 | hadoopConf: Configuration 75 | ): PartitionedFile => Iterator[InternalRow] = { 76 | throw new UnsupportedOperationException("ExcelFileFormat as fallback format for V2 supports writing only") 77 | } 78 | 79 | } 80 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.1/scala/dev/mauch/spark/excel/v2/ExcelOutputWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
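// ---- Editor's example (not part of ExcelOutputWriter.scala): writing through the V2 path. ----
// A minimal sketch with assumed values; each write task produces one workbook file whose extension
// comes from the factory's getFileExtension (e.g. ".xlsx").
object ExcelV2WriteExample {
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("excel-v2-write").master("local[*]").getOrCreate()
    import spark.implicits._
    val scores = Seq(("Alice", 42), ("Bob", 7)).toDF("name", "score")
    scores
      .coalesce(1)              // one partition -> a single output workbook
      .write
      .format("excel")
      .option("header", "true") // ExcelOutputWriter emits a header row when this is set
      .mode("overwrite")
      .save("/tmp/scores_xlsx") // hypothetical output directory
    spark.stop()
  }
}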
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.mapreduce.TaskAttemptContext 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | import org.apache.spark.sql.execution.datasources.OutputWriter 23 | import org.apache.spark.sql.types.StructType 24 | 25 | class ExcelOutputWriter(path: String, dataSchema: StructType, context: TaskAttemptContext, options: ExcelOptions) 26 | extends OutputWriter 27 | with Logging { 28 | 29 | private val gen = new ExcelGenerator(path, dataSchema, context.getConfiguration, options) 30 | if (options.header) { gen.writeHeaders() } 31 | 32 | override def write(row: InternalRow): Unit = gen.write(row) 33 | 34 | override def close(): Unit = gen.close() 35 | } 36 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.1/scala/dev/mauch/spark/excel/v2/ExcelTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.fs.FileStatus 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.connector.write.LogicalWriteInfo 22 | import org.apache.spark.sql.connector.write.WriteBuilder 23 | import org.apache.spark.sql.execution.datasources.FileFormat 24 | import org.apache.spark.sql.execution.datasources.v2.FileTable 25 | import org.apache.spark.sql.types.DataType 26 | import org.apache.spark.sql.types.StructType 27 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 28 | import org.apache.spark.sql.connector.catalog.TableCapability 29 | import org.apache.spark.sql.connector.catalog.TableCapability._ 30 | import scala.jdk.CollectionConverters._ 31 | 32 | case class ExcelTable( 33 | name: String, 34 | sparkSession: SparkSession, 35 | map: CaseInsensitiveStringMap, 36 | paths: Seq[String], 37 | userSpecifiedSchema: Option[StructType] 38 | ) extends FileTable(sparkSession, map, paths, userSpecifiedSchema) { 39 | 40 | override def newScanBuilder(params: CaseInsensitiveStringMap): ExcelScanBuilder = 41 | ExcelScanBuilder(sparkSession, fileIndex, schema, dataSchema, params) 42 | 43 | override def inferSchema(files: Seq[FileStatus]): Option[StructType] = { 44 | val options = 45 | new ExcelOptions(map.asScala.toMap, sparkSession.sessionState.conf.sessionLocalTimeZone) 46 | 47 | if (files.nonEmpty) Some(infer(sparkSession, files, options)) else Some(StructType(Seq.empty)) 48 | } 49 | 50 | override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = 51 | new ExcelWriteBuilder(paths, formatName, supportsDataType, info) 52 | 53 | override def supportsDataType(dataType: DataType): Boolean = true 54 | 55 | override def formatName: String = "Excel" 56 | 57 | override def fallbackFileFormat: Class[_ <: FileFormat] = 58 | throw new UnsupportedOperationException("Excel 
does not support V1 File Format") 59 | 60 | override def capabilities: java.util.Set[TableCapability] = 61 | Set(ACCEPT_ANY_SCHEMA, BATCH_READ, BATCH_WRITE, OVERWRITE_BY_FILTER, TRUNCATE).asJava 62 | 63 | /* Actual doing schema inferring */ 64 | private def infer(sparkSession: SparkSession, inputPaths: Seq[FileStatus], options: ExcelOptions): StructType = { 65 | val excelHelper = ExcelHelper(options) 66 | val conf = sparkSession.sessionState.newHadoopConf() 67 | 68 | /** Sampling ratio on file level (not row level as in CSV) */ 69 | val paths = { 70 | var sample = (inputPaths.size * options.samplingRatio).intValue 71 | sample = if (sample < 1) 1 else sample 72 | inputPaths.take(sample).map(_.getPath.toUri) 73 | } 74 | val (sheetData, colNames) = excelHelper.parseSheetData(conf, paths) 75 | try { 76 | if (sheetData.rowIterator.isEmpty) { 77 | /* If the first file is empty, not checking further */ 78 | StructType(Seq.empty) 79 | } else { 80 | /* Ready to infer schema */ 81 | ExcelInferSchema(options).infer(sheetData.rowIterator, colNames) 82 | } 83 | } finally { 84 | sheetData.close() 85 | } 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.1/scala/dev/mauch/spark/excel/v2/ExcelWriteBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.mapreduce.Job 20 | import org.apache.hadoop.mapreduce.TaskAttemptContext 21 | import org.apache.spark.sql.connector.write.LogicalWriteInfo 22 | import org.apache.spark.sql.execution.datasources.OutputWriter 23 | import org.apache.spark.sql.execution.datasources.OutputWriterFactory 24 | import org.apache.spark.sql.execution.datasources.v2.FileWriteBuilder 25 | import org.apache.spark.sql.internal.SQLConf 26 | import org.apache.spark.sql.types.DataType 27 | import org.apache.spark.sql.types.StructType 28 | 29 | class ExcelWriteBuilder( 30 | paths: Seq[String], 31 | formatName: String, 32 | supportsDataType: DataType => Boolean, 33 | info: LogicalWriteInfo 34 | ) extends FileWriteBuilder(paths, formatName, supportsDataType, info) { 35 | override def prepareWrite( 36 | sqlConf: SQLConf, 37 | job: Job, 38 | options: Map[String, String], 39 | dataSchema: StructType 40 | ): OutputWriterFactory = { 41 | 42 | val excelOptions = new ExcelOptions(options, sqlConf.sessionLocalTimeZone) 43 | 44 | new OutputWriterFactory { 45 | override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { 46 | new ExcelOutputWriter(path, dataSchema, context, excelOptions) 47 | } 48 | 49 | override def getFileExtension(context: TaskAttemptContext): String = 50 | s".${excelOptions.fileExtension}" 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.2/scala/dev/mauch/spark/excel/v2/ExcelScan.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.catalyst.expressions.{ExprUtils, Expression} 22 | import org.apache.spark.sql.connector.read.PartitionReaderFactory 23 | import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex 24 | import org.apache.spark.sql.execution.datasources.v2.{FileScan, TextBasedFileScan} 25 | import org.apache.spark.sql.sources.Filter 26 | import org.apache.spark.sql.types.StructType 27 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 28 | import org.apache.spark.util.SerializableConfiguration 29 | 30 | import scala.collection.compat.immutable.ArraySeq 31 | import scala.jdk.CollectionConverters._ 32 | 33 | case class ExcelScan( 34 | sparkSession: SparkSession, 35 | fileIndex: PartitioningAwareFileIndex, 36 | dataSchema: StructType, 37 | readDataSchema: StructType, 38 | readPartitionSchema: StructType, 39 | options: CaseInsensitiveStringMap, 40 | pushedFilters: Array[Filter], 41 | partitionFilters: Seq[Expression] = Seq.empty, 42 | dataFilters: Seq[Expression] = Seq.empty 43 | ) extends TextBasedFileScan(sparkSession, options) { 44 | 45 | private lazy val parsedOptions: ExcelOptions = new ExcelOptions( 46 | options.asScala.toMap, 47 | sparkSession.sessionState.conf.sessionLocalTimeZone, 48 | sparkSession.sessionState.conf.columnNameOfCorruptRecord 49 | ) 50 | 51 | override def isSplitable(path: Path): Boolean = false 52 | 53 | override def getFileUnSplittableReason(path: Path): String = { 54 | "No practical method of splitting an excel file" 55 | } 56 | 57 | override def createReaderFactory(): PartitionReaderFactory = { 58 | 59 | /* Check a field requirement for corrupt records here to throw an exception in a driver side 60 | */ 61 | ExprUtils.verifyColumnNameOfCorruptRecord(dataSchema, parsedOptions.columnNameOfCorruptRecord) 62 | 63 | if ( 64 | readDataSchema.length == 1 && 65 | readDataSchema.head.name == parsedOptions.columnNameOfCorruptRecord 66 | ) { 67 | throw new RuntimeException( 68 | "Queries from raw Excel files are disallowed when the referenced " + 69 | "columns only include the internal corrupt record column" 70 | ) 71 | } 72 | 73 | val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap 74 | 75 | /* Hadoop Configurations are case sensitive. */ 76 | val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) 77 | 78 | val broadcastedConf = sparkSession.sparkContext 79 | .broadcast(new SerializableConfiguration(hadoopConf)) 80 | 81 | /* The partition values are already truncated in `FileScan.partitions`. We should use `readPartitionSchema` as the 82 | * partition schema here. 
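    // Editor's note (illustrative, not from the original source): with a partitioned layout such as
    //   /data/sales/Quarter=1/a.xlsx  and  /data/sales/Quarter=2/b.xlsx   (hypothetical paths)
    // a read like
    //   spark.read.format("excel").option("header", "true").load("/data/sales")
    // exposes `Quarter` as a partition column whose values come from the directory names rather than
    // from the sheets; that is why only `readPartitionSchema` is handed to the reader factory below.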
83 | */ 84 | ExcelPartitionReaderFactory( 85 | sparkSession.sessionState.conf, 86 | broadcastedConf, 87 | dataSchema, 88 | readDataSchema, 89 | readPartitionSchema, 90 | parsedOptions, 91 | ArraySeq.unsafeWrapArray(pushedFilters) 92 | ) 93 | } 94 | 95 | override def withFilters(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan = 96 | this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters) 97 | 98 | override def equals(obj: Any): Boolean = obj match { 99 | case c: ExcelScan => 100 | super.equals(c) && dataSchema == c.dataSchema && options == c.options && 101 | equivalentFilters(pushedFilters, c.pushedFilters) 102 | case _ => false 103 | } 104 | 105 | override def hashCode(): Int = super.hashCode() 106 | 107 | override def description(): String = { 108 | super.description() + ", PushedFilters: " + pushedFilters.mkString("[", ", ", "]") 109 | } 110 | } 111 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.2/scala/dev/mauch/spark/excel/v2/ExcelScanBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.SparkSession 20 | import org.apache.spark.sql.connector.read.{Scan, SupportsPushDownFilters} 21 | import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex 22 | import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder 23 | import org.apache.spark.sql.sources.Filter 24 | import org.apache.spark.sql.types.StructType 25 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 26 | 27 | case class ExcelScanBuilder( 28 | sparkSession: SparkSession, 29 | fileIndex: PartitioningAwareFileIndex, 30 | schema: StructType, 31 | dataSchema: StructType, 32 | options: CaseInsensitiveStringMap 33 | ) extends FileScanBuilder(sparkSession, fileIndex, dataSchema) 34 | with SupportsPushDownFilters { 35 | 36 | override def build(): Scan = { 37 | ExcelScan(sparkSession, fileIndex, dataSchema, readDataSchema(), readPartitionSchema(), options, pushedFilters()) 38 | } 39 | 40 | private var _pushedFilters: Array[Filter] = Array.empty 41 | 42 | override def pushFilters(filters: Array[Filter]): Array[Filter] = { 43 | _pushedFilters = ExcelFilters.pushedFilters(filters, dataSchema) 44 | filters 45 | } 46 | 47 | override def pushedFilters(): Array[Filter] = _pushedFilters 48 | } 49 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.3/scala/dev/mauch/spark/excel/v2/ExcelOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
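// ---- Editor's example (not part of ExcelOptions.scala): constructing the options directly. ----
// A minimal sketch; the option map and time zone are illustrative. The auxiliary constructors differ
// only in whether the corrupt-record column name comes from SQLConf or is supplied by the caller.
object ExcelOptionsExample {
  import dev.mauch.spark.excel.v2.ExcelOptions

  def main(args: Array[String]): Unit = {
    val opts = new ExcelOptions(
      Map("header" -> "true", "dataAddress" -> "'Sheet1'!A1"), // hypothetical option values
      "UTC"                                                    // defaultTimeZoneId
    )
    println(opts.header) // individual option parsing lives in ExcelOptionsTrait
  }
}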
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 20 | import org.apache.spark.sql.internal.SQLConf 21 | 22 | class ExcelOptions( 23 | @transient 24 | val parameters: CaseInsensitiveMap[String], 25 | val defaultTimeZoneId: String, 26 | val defaultColumnNameOfCorruptRecord: String 27 | ) extends ExcelOptionsTrait 28 | with Serializable { 29 | // all parameter handling is implemented in ExcelOptionsTrait 30 | 31 | def this(parameters: Map[String, String], defaultTimeZoneId: String) = { 32 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, SQLConf.get.columnNameOfCorruptRecord) 33 | } 34 | 35 | def this(parameters: Map[String, String], defaultTimeZoneId: String, defaultColumnNameOfCorruptRecord: String) = { 36 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, defaultColumnNameOfCorruptRecord) 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.4.1/scala/dev/mauch/spark/excel/v2/ExcelParserBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import _root_.org.apache.spark.sql.catalyst.util.BadRecordException 20 | import org.apache.spark.unsafe.types.UTF8String 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | 23 | trait ExcelParserBase { 24 | 25 | protected def getCurrentInput: UTF8String 26 | def badRecord(partialResults: Array[InternalRow], baseException: Throwable): BadRecordException = 27 | BadRecordException(() => getCurrentInput, () => partialResults.headOption, baseException) 28 | } 29 | -------------------------------------------------------------------------------- /src/main/3.0_to_3.4.1/scala/dev/mauch/spark/excel/v2/ExcelPartitionReaderFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.conf.Configuration 20 | import org.apache.spark.broadcast.Broadcast 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | import org.apache.spark.sql.connector.read.PartitionReader 23 | import org.apache.spark.sql.execution.datasources.PartitionedFile 24 | import org.apache.spark.sql.execution.datasources.v2._ 25 | import org.apache.spark.sql.internal.SQLConf 26 | import org.apache.spark.sql.sources.Filter 27 | import org.apache.spark.sql.types.StructType 28 | import org.apache.spark.util.SerializableConfiguration 29 | 30 | import java.net.URI 31 | import scala.util.control.NonFatal 32 | 33 | /** A factory used to create Excel readers. 34 | * 35 | * @param sqlConf 36 | * SQL configuration. 37 | * @param broadcastedConf 38 | * Broadcasted serializable Hadoop Configuration. 39 | * @param dataSchema 40 | * Schema of Excel files. 41 | * @param readDataSchema 42 | * Required data schema in the batch scan. 43 | * @param partitionSchema 44 | * Schema of partitions. 45 | * @param options 46 | * Options for parsing Excel files. 47 | */ 48 | case class ExcelPartitionReaderFactory( 49 | sqlConf: SQLConf, 50 | broadcastedConf: Broadcast[SerializableConfiguration], 51 | dataSchema: StructType, 52 | readDataSchema: StructType, 53 | partitionSchema: StructType, 54 | options: ExcelOptions, 55 | filters: Seq[Filter] 56 | ) extends FilePartitionReaderFactory { 57 | 58 | override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = { 59 | val conf = broadcastedConf.value.value 60 | val actualDataSchema = 61 | StructType(dataSchema.filterNot(_.name == options.columnNameOfCorruptRecord)) 62 | val actualReadDataSchema = 63 | StructType(readDataSchema.filterNot(_.name == options.columnNameOfCorruptRecord)) 64 | val parser = new ExcelParser(actualDataSchema, actualReadDataSchema, options, filters) 65 | val headerChecker = 66 | new ExcelHeaderChecker(actualReadDataSchema, options, source = s"Excel file: ${file.filePath}") 67 | val iter = readFile(conf, file, parser, headerChecker, readDataSchema) 68 | val partitionReader = new SparkExcelPartitionReaderFromIterator(iter) 69 | new PartitionReaderWithPartitionValues(partitionReader, readDataSchema, partitionSchema, file.partitionValues) 70 | } 71 | 72 | private def readFile( 73 | conf: Configuration, 74 | file: PartitionedFile, 75 | parser: ExcelParser, 76 | headerChecker: ExcelHeaderChecker, 77 | requiredSchema: StructType 78 | ): SheetData[InternalRow] = { 79 | val excelHelper = ExcelHelper(options) 80 | val sheetData = excelHelper.getSheetData(conf, URI.create(file.filePath.toString)) 81 | try { 82 | SheetData( 83 | ExcelParser.parseIterator(sheetData.rowIterator, parser, headerChecker, requiredSchema), 84 | sheetData.resourcesToClose 85 | ) 86 | } catch { 87 | case NonFatal(t) => { 88 | sheetData.close() 89 | throw t 90 | } 91 | } 92 | } 93 | 94 | } 95 | 96 | private class SparkExcelPartitionReaderFromIterator(sheetData: SheetData[InternalRow]) 97 | extends 
PartitionReaderFromIterator[InternalRow](sheetData.rowIterator) { 98 | override def close(): Unit = { 99 | super.close() 100 | sheetData.close() 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/main/3.1/scala/dev/mauch/spark/excel/v2/ExcelDateTimeStringUtils.scala: -------------------------------------------------------------------------------- 1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape) 2 | * 3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with 4 | * the License. You may obtain a copy of the License at 5 | * 6 | * http://www.apache.org/licenses/LICENSE-2.0 7 | * 8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on 9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the 10 | * specific language governing permissions and limitations under the License. 11 | */ 12 | package dev.mauch.spark.excel.v2 13 | 14 | import org.apache.spark.unsafe.types.UTF8String 15 | import org.apache.spark.sql.catalyst.util._ 16 | import java.time.ZoneId 17 | import org.apache.spark.sql.catalyst.util.TimestampFormatter 18 | import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT 19 | 20 | object ExcelDateTimeStringUtils { 21 | def stringToTimestamp(v: String, zoneId: ZoneId): Option[Long] = { 22 | val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(v)) 23 | DateTimeUtils.stringToTimestamp(str, zoneId) 24 | } 25 | 26 | def stringToDate(v: String, zoneId: ZoneId): Option[Int] = { 27 | val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(v)) 28 | DateTimeUtils.stringToDate(str, zoneId) 29 | } 30 | 31 | def getTimestampFormatter(options: ExcelOptions): TimestampFormatter = TimestampFormatter( 32 | options.timestampFormat, 33 | options.zoneId, 34 | options.locale, 35 | legacyFormat = FAST_DATE_FORMAT, 36 | isParsing = true 37 | ) 38 | 39 | def getDateFormatter(options: ExcelOptions): DateFormatter = 40 | DateFormatter(options.dateFormat, options.zoneId, options.locale, legacyFormat = FAST_DATE_FORMAT, isParsing = true) 41 | 42 | } 43 | -------------------------------------------------------------------------------- /src/main/3.1_and_up/scala/dev/mauch/spark/excel/v2/ExcelFilters.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
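// ---- Editor's example (not part of ExcelFilters.scala): which filters survive pushdown. ----
// Illustrative only, for an assumed schema; filters referencing known columns are kept so the
// parser can skip non-matching rows early.
object ExcelFiltersExample {
  import dev.mauch.spark.excel.v2.ExcelFilters
  import org.apache.spark.sql.sources
  import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(StructField("name", StringType), StructField("age", IntegerType)))
    val requested: Array[sources.Filter] = Array(sources.IsNotNull("name"), sources.GreaterThan("age", 18))
    val pushed = ExcelFilters.pushedFilters(requested, schema)
    println(pushed.mkString(", "))
  }
}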
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.catalyst.OrderedFilters 20 | import org.apache.spark.sql.catalyst.StructFilters 21 | import org.apache.spark.sql.sources 22 | import org.apache.spark.sql.types.StructType 23 | 24 | /** Wrapping the API change between spark 3.0 vs 3.1 */ 25 | class ExcelFilters(filters: Seq[sources.Filter], requiredSchema: StructType) 26 | extends OrderedFilters(filters, requiredSchema) {} 27 | 28 | object ExcelFilters { 29 | def pushedFilters(filters: Array[sources.Filter], schema: StructType): Array[sources.Filter] = 30 | StructFilters.pushedFilters(filters, schema) 31 | } 32 | -------------------------------------------------------------------------------- /src/main/3.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelDateTimeStringUtils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.unsafe.types.UTF8String 20 | import org.apache.spark.sql.catalyst.util._ 21 | 22 | import java.time.ZoneId 23 | import org.apache.spark.sql.catalyst.util.DateFormatter 24 | import org.apache.spark.sql.catalyst.util.TimestampFormatter 25 | import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT 26 | 27 | import scala.annotation.nowarn 28 | 29 | object ExcelDateTimeStringUtils { 30 | def stringToTimestamp(v: String, zoneId: ZoneId): Option[Long] = { 31 | val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(v)) 32 | DateTimeUtils.stringToTimestamp(str, zoneId) 33 | } 34 | 35 | @nowarn 36 | def stringToDate(v: String, zoneId: ZoneId): Option[Int] = { 37 | val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(v)) 38 | DateTimeUtils.stringToDate(str) 39 | } 40 | 41 | def getTimestampFormatter(options: ExcelOptions): TimestampFormatter = TimestampFormatter( 42 | options.timestampFormat, 43 | options.zoneId, 44 | options.locale, 45 | legacyFormat = FAST_DATE_FORMAT, 46 | isParsing = true 47 | ) 48 | 49 | def getDateFormatter(options: ExcelOptions): DateFormatter = 50 | DateFormatter(options.dateFormat, options.locale, legacyFormat = FAST_DATE_FORMAT, isParsing = true) 51 | 52 | } 53 | -------------------------------------------------------------------------------- /src/main/3.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelOutputWriter.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.mapreduce.TaskAttemptContext 20 | import org.apache.spark.internal.Logging 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | import org.apache.spark.sql.execution.datasources.OutputWriter 23 | import org.apache.spark.sql.types.StructType 24 | 25 | class ExcelOutputWriter(val path: String, dataSchema: StructType, context: TaskAttemptContext, options: ExcelOptions) 26 | extends OutputWriter 27 | with Logging { 28 | 29 | private val gen = new ExcelGenerator(path, dataSchema, context.getConfiguration, options) 30 | if (options.header) { gen.writeHeaders() } 31 | 32 | override def write(row: InternalRow): Unit = gen.write(row) 33 | 34 | override def close(): Unit = gen.close() 35 | } 36 | -------------------------------------------------------------------------------- /src/main/3.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelTable.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
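// ---- Editor's note (not part of ExcelTable.scala): file-level sampling for schema inference. ----
// A small sketch of the arithmetic used in `infer` below, with assumed numbers: samplingRatio
// selects a prefix of the input files (never fewer than one), unlike CSV's row-level sampling.
object SamplingRatioExample {
  def main(args: Array[String]): Unit = {
    val inputFiles = 10
    val samplingRatio = 0.25                                // hypothetical option value
    val sampled = math.max(1, (inputFiles * samplingRatio).toInt)
    println(s"files parsed for schema inference: $sampled") // prints 2
  }
}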
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.fs.FileStatus 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.connector.write.Write 22 | import org.apache.spark.sql.connector.write.LogicalWriteInfo 23 | import org.apache.spark.sql.connector.write.WriteBuilder 24 | import org.apache.spark.sql.execution.datasources.FileFormat 25 | import org.apache.spark.sql.execution.datasources.v2.FileTable 26 | import org.apache.spark.sql.types.DataType 27 | import org.apache.spark.sql.types.StructType 28 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 29 | import org.apache.spark.sql.connector.catalog.TableCapability 30 | import org.apache.spark.sql.connector.catalog.TableCapability._ 31 | import scala.jdk.CollectionConverters._ 32 | 33 | case class ExcelTable( 34 | name: String, 35 | sparkSession: SparkSession, 36 | map: CaseInsensitiveStringMap, 37 | paths: Seq[String], 38 | userSpecifiedSchema: Option[StructType] 39 | ) extends FileTable(sparkSession, map, paths, userSpecifiedSchema) { 40 | 41 | override def newScanBuilder(params: CaseInsensitiveStringMap): ExcelScanBuilder = 42 | ExcelScanBuilder(sparkSession, fileIndex, schema, dataSchema, params) 43 | 44 | override def inferSchema(files: Seq[FileStatus]): Option[StructType] = { 45 | val options = 46 | new ExcelOptions(map.asScala.toMap, sparkSession.sessionState.conf.sessionLocalTimeZone) 47 | 48 | if (files.nonEmpty) Some(infer(sparkSession, files, options)) else Some(StructType(Seq.empty)) 49 | } 50 | 51 | override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder = 52 | new WriteBuilder { 53 | override def build(): Write = ExcelWriteBuilder(paths, formatName, supportsDataType, info) 54 | } 55 | 56 | override def supportsDataType(dataType: DataType): Boolean = true 57 | 58 | override def formatName: String = "Excel" 59 | 60 | override def fallbackFileFormat: Class[_ <: FileFormat] = 61 | throw new UnsupportedOperationException("Excel does not support V1 File Format") 62 | 63 | override def capabilities: java.util.Set[TableCapability] = 64 | Set(ACCEPT_ANY_SCHEMA, BATCH_READ, BATCH_WRITE, OVERWRITE_BY_FILTER, TRUNCATE).asJava 65 | 66 | /* Actual doing schema inferring */ 67 | private def infer(sparkSession: SparkSession, inputPaths: Seq[FileStatus], options: ExcelOptions): StructType = { 68 | val excelHelper = ExcelHelper(options) 69 | val conf = sparkSession.sessionState.newHadoopConf() 70 | 71 | /* Sampling ratio on file level (not row level as in CSV) */ 72 | val paths = { 73 | var sample = (inputPaths.size * options.samplingRatio).intValue 74 | sample = if (sample < 1) 1 else sample 75 | inputPaths.take(sample).map(_.getPath.toUri) 76 | } 77 | val (sheetData, colNames) = excelHelper.parseSheetData(conf, paths) 78 | try { 79 | if (sheetData.rowIterator.isEmpty) { 80 | /* If the first file is empty, not checking further */ 81 | StructType(Seq.empty) 82 | } else { 83 | /* Ready to infer schema */ 84 | ExcelInferSchema(options).infer(sheetData.rowIterator, colNames) 85 | } 86 | } finally { 87 | sheetData.close() 88 | } 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/main/3.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelWriteBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with 
the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.mapreduce.Job 20 | import org.apache.hadoop.mapreduce.TaskAttemptContext 21 | import org.apache.spark.sql.connector.write.LogicalWriteInfo 22 | import org.apache.spark.sql.execution.datasources.OutputWriter 23 | import org.apache.spark.sql.execution.datasources.OutputWriterFactory 24 | import org.apache.spark.sql.execution.datasources.v2.FileWrite 25 | import org.apache.spark.sql.internal.SQLConf 26 | import org.apache.spark.sql.types.DataType 27 | import org.apache.spark.sql.types.StructType 28 | 29 | case class ExcelWriteBuilder( 30 | paths: Seq[String], 31 | formatName: String, 32 | supportsDataType: DataType => Boolean, 33 | info: LogicalWriteInfo 34 | ) extends FileWrite { 35 | override def prepareWrite( 36 | sqlConf: SQLConf, 37 | job: Job, 38 | options: Map[String, String], 39 | dataSchema: StructType 40 | ): OutputWriterFactory = { 41 | 42 | val excelOptions = new ExcelOptions(options, sqlConf.sessionLocalTimeZone) 43 | 44 | new OutputWriterFactory { 45 | override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = { 46 | new ExcelOutputWriter(path, dataSchema, context, excelOptions) 47 | } 48 | 49 | override def getFileExtension(context: TaskAttemptContext): String = 50 | s".${excelOptions.fileExtension}" 51 | } 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/main/3.3_and_up/scala/dev/mauch/spark/excel/v2/ExcelScan.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
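// ---- Editor's note (not part of ExcelScan.scala): how this copy differs from 3.0_to_3.2. ----
// Grounded in the two variants present in this tree: the 3.3_and_up copy drops the withFilters
// override, and its ExcelScanBuilder relies on SupportsPushDownCatalystFilters with
// pushedDataFilters rather than SupportsPushDownFilters, tracking the newer FileScan/FileScanBuilder
// API. The scan behaviour itself (unsplittable files, corrupt-record checks, broadcast Hadoop conf)
// is unchanged.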
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.sql.SparkSession 21 | import org.apache.spark.sql.catalyst.expressions.{ExprUtils, Expression} 22 | import org.apache.spark.sql.connector.read.PartitionReaderFactory 23 | import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex 24 | import org.apache.spark.sql.execution.datasources.v2.TextBasedFileScan 25 | import org.apache.spark.sql.sources.Filter 26 | import org.apache.spark.sql.types.StructType 27 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 28 | import org.apache.spark.util.SerializableConfiguration 29 | 30 | import scala.collection.compat.immutable.ArraySeq 31 | import scala.jdk.CollectionConverters._ 32 | 33 | case class ExcelScan( 34 | sparkSession: SparkSession, 35 | fileIndex: PartitioningAwareFileIndex, 36 | dataSchema: StructType, 37 | readDataSchema: StructType, 38 | readPartitionSchema: StructType, 39 | options: CaseInsensitiveStringMap, 40 | pushedFilters: Array[Filter], 41 | partitionFilters: Seq[Expression] = Seq.empty, 42 | dataFilters: Seq[Expression] = Seq.empty 43 | ) extends TextBasedFileScan(sparkSession, options) { 44 | 45 | private lazy val parsedOptions: ExcelOptions = new ExcelOptions( 46 | options.asScala.toMap, 47 | sparkSession.sessionState.conf.sessionLocalTimeZone, 48 | sparkSession.sessionState.conf.columnNameOfCorruptRecord 49 | ) 50 | 51 | override def isSplitable(path: Path): Boolean = false 52 | 53 | override def getFileUnSplittableReason(path: Path): String = { 54 | "No practical method of splitting an excel file" 55 | } 56 | 57 | override def createReaderFactory(): PartitionReaderFactory = { 58 | 59 | /* Check a field requirement for corrupt records here to throw an exception in a driver side 60 | */ 61 | ExprUtils.verifyColumnNameOfCorruptRecord(dataSchema, parsedOptions.columnNameOfCorruptRecord) 62 | 63 | if ( 64 | readDataSchema.length == 1 && 65 | readDataSchema.head.name == parsedOptions.columnNameOfCorruptRecord 66 | ) { 67 | throw new RuntimeException( 68 | "Queries from raw Excel files are disallowed when the referenced " + 69 | "columns only include the internal corrupt record column" 70 | ) 71 | } 72 | 73 | val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap 74 | 75 | /* Hadoop Configurations are case sensitive. */ 76 | val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap) 77 | 78 | val broadcastedConf = sparkSession.sparkContext 79 | .broadcast(new SerializableConfiguration(hadoopConf)) 80 | 81 | /* The partition values are already truncated in `FileScan.partitions`. We should use `readPartitionSchema` as the 82 | * partition schema here. 
83 | */ 84 | ExcelPartitionReaderFactory( 85 | sparkSession.sessionState.conf, 86 | broadcastedConf, 87 | dataSchema, 88 | readDataSchema, 89 | readPartitionSchema, 90 | parsedOptions, 91 | ArraySeq.unsafeWrapArray(pushedFilters) 92 | ) 93 | } 94 | 95 | override def equals(obj: Any): Boolean = obj match { 96 | case c: ExcelScan => 97 | super.equals(c) && dataSchema == c.dataSchema && options == c.options && 98 | equivalentFilters(pushedFilters, c.pushedFilters) 99 | case _ => false 100 | } 101 | 102 | override def hashCode(): Int = super.hashCode() 103 | 104 | override def description(): String = { 105 | super.description() + ", PushedFilters: " + pushedFilters.mkString("[", ", ", "]") 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /src/main/3.3_and_up/scala/dev/mauch/spark/excel/v2/ExcelScanBuilder.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.SparkSession 20 | import org.apache.spark.sql.connector.read.Scan 21 | import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex 22 | import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder 23 | import org.apache.spark.sql.internal.connector.SupportsPushDownCatalystFilters 24 | import org.apache.spark.sql.types.StructType 25 | import org.apache.spark.sql.util.CaseInsensitiveStringMap 26 | 27 | case class ExcelScanBuilder( 28 | sparkSession: SparkSession, 29 | fileIndex: PartitioningAwareFileIndex, 30 | schema: StructType, 31 | dataSchema: StructType, 32 | options: CaseInsensitiveStringMap 33 | ) extends FileScanBuilder(sparkSession, fileIndex, dataSchema) 34 | with SupportsPushDownCatalystFilters { 35 | 36 | override def build(): Scan = { 37 | ExcelScan(sparkSession, fileIndex, dataSchema, readDataSchema(), readPartitionSchema(), options, pushedDataFilters) 38 | } 39 | 40 | } 41 | -------------------------------------------------------------------------------- /src/main/3.4.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelParserBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
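// ---- Editor's note (not part of ExcelParserBase.scala): why this trait exists in two copies. ----
// In the 3.4.2_and_up tree, BadRecordException is invoked with the full array of partial rows
// (() => Array[InternalRow]), whereas the 3.0_to_3.4.1 copy passes a single optional row via
// partialResults.headOption; each source tree carries the call matching its Spark API.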
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import _root_.org.apache.spark.sql.catalyst.util.BadRecordException 20 | import org.apache.spark.unsafe.types.UTF8String 21 | import org.apache.spark.sql.catalyst.InternalRow 22 | 23 | trait ExcelParserBase { 24 | 25 | protected def getCurrentInput: UTF8String 26 | def badRecord(partialResults: Array[InternalRow], baseException: Throwable): BadRecordException = 27 | BadRecordException(() => getCurrentInput, () => partialResults, baseException) 28 | } 29 | -------------------------------------------------------------------------------- /src/main/3.4.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelPartitionReaderFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.hadoop.conf.Configuration 20 | import org.apache.spark.broadcast.Broadcast 21 | import org.apache.spark.sql.catalyst.{FileSourceOptions, InternalRow} 22 | import org.apache.spark.sql.connector.read.PartitionReader 23 | import org.apache.spark.sql.execution.datasources.PartitionedFile 24 | import org.apache.spark.sql.execution.datasources.v2._ 25 | import org.apache.spark.sql.internal.SQLConf 26 | import org.apache.spark.sql.sources.Filter 27 | import org.apache.spark.sql.types.StructType 28 | import org.apache.spark.util.SerializableConfiguration 29 | 30 | import java.net.URI 31 | import scala.util.control.NonFatal 32 | 33 | /** A factory used to create Excel readers. 34 | * 35 | * @param sqlConf 36 | * SQL configuration. 37 | * @param broadcastedConf 38 | * Broadcasted serializable Hadoop Configuration. 39 | * @param dataSchema 40 | * Schema of Excel files. 41 | * @param readDataSchema 42 | * Required data schema in the batch scan. 43 | * @param partitionSchema 44 | * Schema of partitions. 45 | * @param parsedOptions 46 | * Options for parsing Excel files. 
47 | */ 48 | case class ExcelPartitionReaderFactory( 49 | sqlConf: SQLConf, 50 | broadcastedConf: Broadcast[SerializableConfiguration], 51 | dataSchema: StructType, 52 | readDataSchema: StructType, 53 | partitionSchema: StructType, 54 | parsedOptions: ExcelOptions, 55 | filters: Seq[Filter] 56 | ) extends FilePartitionReaderFactory { 57 | protected def options: FileSourceOptions = new FileSourceOptions( 58 | Map(FileSourceOptions.IGNORE_CORRUPT_FILES -> "true", FileSourceOptions.IGNORE_MISSING_FILES -> "true") 59 | ) 60 | override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = { 61 | val conf = broadcastedConf.value.value 62 | val actualDataSchema = 63 | StructType(dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) 64 | val actualReadDataSchema = 65 | StructType(readDataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord)) 66 | val parser = new ExcelParser(actualDataSchema, actualReadDataSchema, parsedOptions, filters) 67 | val headerChecker = 68 | new ExcelHeaderChecker(actualReadDataSchema, parsedOptions, source = s"Excel file: ${file.filePath}") 69 | val iter = readFile(conf, file, parser, headerChecker, readDataSchema) 70 | val partitionReader = new SparkExcelPartitionReaderFromIterator(iter) 71 | new PartitionReaderWithPartitionValues(partitionReader, readDataSchema, partitionSchema, file.partitionValues) 72 | } 73 | 74 | private def readFile( 75 | conf: Configuration, 76 | file: PartitionedFile, 77 | parser: ExcelParser, 78 | headerChecker: ExcelHeaderChecker, 79 | requiredSchema: StructType 80 | ): SheetData[InternalRow] = { 81 | val excelHelper = ExcelHelper(parsedOptions) 82 | val sheetData = excelHelper.getSheetData(conf, URI.create(file.filePath.toString)) 83 | try { 84 | SheetData( 85 | ExcelParser.parseIterator(sheetData.rowIterator, parser, headerChecker, requiredSchema), 86 | sheetData.resourcesToClose 87 | ) 88 | } catch { 89 | case NonFatal(t) => { 90 | sheetData.close() 91 | throw t 92 | } 93 | } 94 | } 95 | 96 | } 97 | 98 | private class SparkExcelPartitionReaderFromIterator(sheetData: SheetData[InternalRow]) 99 | extends PartitionReaderFromIterator[InternalRow](sheetData.rowIterator) { 100 | override def close(): Unit = { 101 | super.close() 102 | sheetData.close() 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /src/main/3.4_and_up/scala/dev/mauch/spark/excel/v2/ExcelOptions.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.catalyst.FileSourceOptions 20 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap 21 | import org.apache.spark.sql.internal.SQLConf 22 | 23 | class ExcelOptions( 24 | @transient 25 | val parameters: CaseInsensitiveMap[String], 26 | val defaultTimeZoneId: String, 27 | val defaultColumnNameOfCorruptRecord: String 28 | ) extends FileSourceOptions(parameters) 29 | with ExcelOptionsTrait { 30 | 31 | def this(parameters: Map[String, String], defaultTimeZoneId: String) = { 32 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, SQLConf.get.columnNameOfCorruptRecord) 33 | } 34 | 35 | def this(parameters: Map[String, String], defaultTimeZoneId: String, defaultColumnNameOfCorruptRecord: String) = { 36 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, defaultColumnNameOfCorruptRecord) 37 | } 38 | 39 | } 40 | -------------------------------------------------------------------------------- /src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister: -------------------------------------------------------------------------------- 1 | dev.mauch.spark.excel.v2.ExcelDataSource 2 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/DefaultSource.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
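// ---- Editor's example (not part of DefaultSource.scala): the original V1 relation in use. ----
// A minimal sketch with assumed paths and options; "header" is mandatory here because
// createRelation calls checkParameter("header").
object ExcelV1ReadExample {
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("excel-v1-read").master("local[*]").getOrCreate()
    val df = spark.read
      .format("dev.mauch.spark.excel") // the package name resolves to this DefaultSource
      .option("header", "true")        // required; see checkParameter below
      .option("inferSchema", "true")   // asks ExcelRelation to infer column types (inferSheetSchema)
      .load("/tmp/input.xlsx")         // hypothetical path
    df.show()
    spark.stop()
  }
}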
15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.hadoop.fs.Path 20 | import org.apache.spark.sql.sources._ 21 | import org.apache.spark.sql.types.StructType 22 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode} 23 | 24 | class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider { 25 | 26 | /** Creates a new relation for retrieving data from an Excel file 27 | */ 28 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): ExcelRelation = 29 | createRelation(sqlContext, parameters, null) 30 | 31 | /** Creates a new relation for retrieving data from an Excel file 32 | */ 33 | override def createRelation( 34 | sqlContext: SQLContext, 35 | parameters: Map[String, String], 36 | schema: StructType 37 | ): ExcelRelation = { 38 | val conf = sqlContext.sparkSession.sessionState.newHadoopConf() 39 | val wbReader = WorkbookReader(parameters, conf) 40 | val dataLocator = DataLocator(parameters) 41 | ExcelRelation( 42 | header = checkParameter(parameters, "header").toBoolean, 43 | treatEmptyValuesAsNulls = parameters.get("treatEmptyValuesAsNulls").fold(false)(_.toBoolean), 44 | setErrorCellsToFallbackValues = parameters.get("setErrorCellsToFallbackValues").fold(false)(_.toBoolean), 45 | usePlainNumberFormat = parameters.get("usePlainNumberFormat").fold(false)(_.toBoolean), 46 | userSchema = Option(schema), 47 | inferSheetSchema = parameters.get("inferSchema").fold(false)(_.toBoolean), 48 | addColorColumns = parameters.get("addColorColumns").fold(false)(_.toBoolean), 49 | timestampFormat = parameters.get("timestampFormat"), 50 | dateFormat = parameters.get("dateFormat"), 51 | excerptSize = parameters.get("excerptSize").fold(10)(_.toInt), 52 | dataLocator = dataLocator, 53 | workbookReader = wbReader 54 | )(sqlContext) 55 | } 56 | 57 | override def createRelation( 58 | sqlContext: SQLContext, 59 | mode: SaveMode, 60 | parameters: Map[String, String], 61 | data: DataFrame 62 | ): BaseRelation = { 63 | val path = checkParameter(parameters, "path") 64 | val header = checkParameter(parameters, "header").toBoolean 65 | val filesystemPath = new Path(path) 66 | val conf = sqlContext.sparkSession.sessionState.newHadoopConf() 67 | val fs = filesystemPath.getFileSystem(conf) 68 | new ExcelFileSaver( 69 | fs, 70 | filesystemPath, 71 | data, 72 | saveMode = mode, 73 | header = header, 74 | dataLocator = DataLocator(parameters) 75 | ).save() 76 | 77 | createRelation(sqlContext, parameters, data.schema) 78 | } 79 | 80 | // Forces a Parameter to exist, otherwise an exception is thrown. 81 | private def checkParameter(map: Map[String, String], param: String): String = { 82 | if (!map.contains(param)) { 83 | throw new IllegalArgumentException(s"Parameter ${'"'}$param${'"'} is missing in options.") 84 | } else { 85 | map.apply(param) 86 | } 87 | } 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/DefaultSource15.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.spark.sql.sources.DataSourceRegister 20 | 21 | class DefaultSource15 extends DefaultSource with DataSourceRegister { 22 | override def shortName(): String = "excel" 23 | } 24 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/ExcelFileSaver.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path} 20 | import org.apache.poi.xssf.usermodel.XSSFWorkbook 21 | import org.apache.spark.sql.{DataFrame, SaveMode} 22 | import spoiwo.model._ 23 | import spoiwo.natures.xlsx.Model2XlsxConversions._ 24 | 25 | import java.io.BufferedOutputStream 26 | import scala.jdk.CollectionConverters._ 27 | 28 | object ExcelFileSaver { 29 | final val DEFAULT_SHEET_NAME = "Sheet1" 30 | final val DEFAULT_DATE_FORMAT = "yy-m-d h:mm" 31 | final val DEFAULT_TIMESTAMP_FORMAT = "yyyy-mm-dd hh:mm:ss.000" 32 | } 33 | 34 | class ExcelFileSaver( 35 | fs: FileSystem, 36 | location: Path, 37 | dataFrame: DataFrame, 38 | saveMode: SaveMode, 39 | dataLocator: DataLocator, 40 | header: Boolean = true 41 | ) { 42 | def save(): Unit = { 43 | def sheet(workbook: XSSFWorkbook) = { 44 | val headerRow = if (header) Some(dataFrame.schema.fields.map(_.name).toSeq) else None 45 | val dataRows = dataFrame 46 | .toLocalIterator() 47 | .asScala 48 | .map(_.toSeq) 49 | dataLocator.toSheet(headerRow, dataRows, workbook) 50 | } 51 | val fileAlreadyExists = fs.exists(location) 52 | def writeToWorkbook(workbook: XSSFWorkbook): Unit = { 53 | Workbook(sheet(workbook)).writeToExisting(workbook) 54 | autoClose(new BufferedOutputStream(fs.create(location)))(workbook.write) 55 | } 56 | (fileAlreadyExists, saveMode) match { 57 | case (false, _) | (_, SaveMode.Overwrite) => 58 | if (fileAlreadyExists) { 59 | fs.delete(location, true) 60 | } 61 | writeToWorkbook(new XSSFWorkbook()) 62 | case (true, SaveMode.ErrorIfExists) => 63 | sys.error(s"path $location already exists.") 64 | case (true, SaveMode.Ignore) => () 65 | case (true, SaveMode.Append) => 66 | val inputStream: FSDataInputStream = fs.open(location) 67 | val workbook = new XSSFWorkbook(inputStream) 68 | inputStream.close() 69 | writeToWorkbook(workbook) 70 | } 71 | } 72 | 73 | def autoClose[A <: AutoCloseable, B](closeable: 
A)(fun: (A) => B): B = { 74 | try { 75 | fun(closeable) 76 | } finally { 77 | closeable.close() 78 | } 79 | } 80 | } 81 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/InferSchema.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.spark.rdd.RDD 20 | import org.apache.spark.sql.types._ 21 | 22 | private[excel] object InferSchema { 23 | type CellType = Int 24 | 25 | /** Similar to the JSON schema inference. [[org.apache.spark.sql.execution.datasources.json.InferSchema]] 26 | * 1. Infer type of each row 2. Merge row types to find common type 3. Replace any null types with string type 27 | */ 28 | def apply(rowsRDD: RDD[Seq[DataType]]): Array[DataType] = { 29 | val startType: Array[DataType] = Array.empty 30 | val rootTypes: Array[DataType] = rowsRDD.aggregate(startType)(inferRowType, mergeRowTypes) 31 | 32 | rootTypes.map { 33 | case _: NullType => StringType 34 | case other => other 35 | } 36 | } 37 | 38 | private def inferRowType(rowSoFar: Array[DataType], next: Seq[DataType]): Array[DataType] = { 39 | val maxLength = math.max(rowSoFar.length, next.size) 40 | val defaultDataType: Int => DataType = (_ => NullType) 41 | val filledRowSoFar = Array.tabulate(maxLength)(n => rowSoFar.applyOrElse[Int, DataType](n, defaultDataType)) 42 | val filledNext = Array.tabulate(maxLength)(n => next.applyOrElse[Int, DataType](n, defaultDataType)) 43 | filledRowSoFar.zip(filledNext).map { case (r, n) => inferField(r, n) } 44 | } 45 | 46 | private[excel] def mergeRowTypes(first: Array[DataType], second: Array[DataType]): Array[DataType] = { 47 | first.zipAll(second, NullType, NullType).map { case ((a, b)) => 48 | findTightestCommonType(a, b).getOrElse(NullType) 49 | } 50 | } 51 | 52 | /** Infer type of string field. Given known type Double, and a string "1", there is no point checking if it is an Int, 53 | * as the final type must be Double or higher. 54 | */ 55 | private[excel] def inferField(typeSoFar: DataType, field: DataType): DataType = { 56 | // Defining a function to return the StringType constant is necessary in order to work around 57 | // a Scala compiler issue which leads to runtime incompatibilities with certain Spark versions; 58 | // see issue #128 for more details. 
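    // Illustrative behaviour of inferField (assumed examples, not part of the original source):
    //   inferField(NullType, DoubleType)    => DoubleType   (the first non-null type wins)
    //   inferField(DoubleType, BooleanType) => StringType   (mixed types fall back to string)
    //   inferField(StringType, DoubleType)  => StringType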
59 | def stringType(): DataType = { 60 | StringType 61 | } 62 | 63 | if (field == NullType) { 64 | typeSoFar 65 | } else { 66 | (typeSoFar, field) match { 67 | case (NullType, ct) => ct 68 | case (DoubleType, DoubleType) => DoubleType 69 | case (BooleanType, BooleanType) => BooleanType 70 | case (TimestampType, TimestampType) => TimestampType 71 | case (StringType, _) => stringType() 72 | case (_, _) => stringType() 73 | } 74 | } 75 | } 76 | 77 | /** Copied from internal Spark api [[org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion]] 78 | */ 79 | private val numericPrecedence: IndexedSeq[DataType] = 80 | IndexedSeq[DataType](ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, TimestampType) 81 | 82 | /** Copied from internal Spark api [[org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion]] 83 | */ 84 | val findTightestCommonType: (DataType, DataType) => Option[DataType] = { 85 | case (t1, t2) if t1 == t2 => Some(t1) 86 | case (NullType, t1) => Some(t1) 87 | case (t1, NullType) => Some(t1) 88 | case (StringType, _) => Some(StringType) 89 | case (_, StringType) => Some(StringType) 90 | 91 | // Promote numeric types to the highest of the two and all numeric types to unlimited decimal 92 | case (t1, t2) if Seq(t1, t2).forall(numericPrecedence.contains) => 93 | val index = numericPrecedence.lastIndexWhere(t => t == t1 || t == t2) 94 | Some(numericPrecedence(index)) 95 | 96 | case _ => None 97 | } 98 | } 99 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/PlainNumberFormat.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import java.math.BigDecimal 20 | import java.text.FieldPosition 21 | import java.text.Format 22 | import java.text.ParsePosition 23 | 24 | /** A format that formats a double as a plain string without rounding and scientific notation. All other operations are 25 | * unsupported. 26 | * @see 27 | * [[org.apache.poi.ss.usermodel.ExcelGeneralNumberFormat]] and SSNFormat from 28 | * [[org.apache.poi.ss.usermodel.DataFormatter]] from Apache POI. 
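 * @example
 *   Illustration added for clarity (assumed values, not taken from the original source): for the
 *   double 1.23456789012E11 this format appends "123456789012", and for 0.0000123 it appends
 *   "0.0000123", whereas a General/scientific-notation rendering would show something like
 *   "1.23457E+11" and "1.23E-05".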
29 | */ 30 | object PlainNumberFormat extends Format { 31 | 32 | override def format(number: AnyRef, toAppendTo: StringBuffer, pos: FieldPosition): StringBuffer = { 33 | // Convert to BigDecimal for formatting 34 | val bd = new BigDecimal(number.toString) 35 | // Check if the number is an integer (scale == 0 after stripping trailing zeros) 36 | val stripped = bd.stripTrailingZeros() 37 | if (stripped.scale() <= 0) { 38 | // It's an integer, format without decimal point 39 | toAppendTo.append(stripped.toBigInteger().toString()) 40 | } else { 41 | // It's not an integer, format as plain string 42 | toAppendTo.append(bd.toPlainString) 43 | } 44 | } 45 | 46 | override def parseObject(source: String, pos: ParsePosition): AnyRef = 47 | throw new UnsupportedOperationException() 48 | } 49 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/Utils.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | import scala.util.{Success, Try} 19 | 20 | object Utils { 21 | implicit class RichTry[T](t: Try[T]) { 22 | def toEither: Either[Throwable, T] = t.transform(s => Success(Right(s)), f => Success(Left(f))).get 23 | } 24 | 25 | case class MapIncluding[K](keys: Seq[K], optionally: Seq[K] = Seq()) { 26 | def unapply[V](m: Map[K, V]): Option[(Seq[V], Seq[Option[V]])] = 27 | if (keys.forall(m.contains)) { 28 | Some((keys.map(m), optionally.map(m.get))) 29 | } else { 30 | None 31 | } 32 | } 33 | sealed trait MapRequirements[K] { 34 | type ResultType[V] 35 | def unapplySeq[V](m: Map[K, V]): Option[ResultType[V]] 36 | } 37 | case class RequiredKeys[K](keys: K*) extends MapRequirements[K] { 38 | type ResultType[V] = Seq[V] 39 | def unapplySeq[V](m: Map[K, V]): Option[Seq[V]] = 40 | if (keys.forall(m.contains)) { 41 | Some(keys.map(m)) 42 | } else { 43 | None 44 | } 45 | } 46 | case class OptionalKeys[K](keys: K*) extends MapRequirements[K] { 47 | type ResultType[V] = Seq[Option[V]] 48 | def unapplySeq[V](m: Map[K, V]): Option[Seq[Option[V]]] = Some(keys.map(m.get)) 49 | } 50 | case class MapWith[K]( 51 | requiredKeys: RequiredKeys[K] = RequiredKeys[K](), 52 | optionalKeys: OptionalKeys[K] = OptionalKeys[K]() 53 | ) { 54 | def unapply[V](m: Map[K, V]): Option[(requiredKeys.ResultType[V], optionalKeys.ResultType[V])] = 55 | for { 56 | req <- requiredKeys.unapplySeq(m) 57 | opt <- optionalKeys.unapplySeq(m) 58 | } yield (req, opt) 59 | } 60 | } 61 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/WorkbookReader.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file 
except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import java.io.InputStream 20 | import dev.mauch.spark.excel.Utils.MapIncluding 21 | import com.github.pjfanning.xlsx.StreamingReader 22 | import org.apache.hadoop.conf.Configuration 23 | import org.apache.hadoop.fs.{FileSystem, Path} 24 | import org.apache.poi.ss.usermodel.{Workbook, WorkbookFactory} 25 | import org.apache.poi.hssf.usermodel.HSSFWorkbookFactory 26 | import org.apache.poi.openxml4j.util.ZipInputStreamZipEntrySource 27 | import org.apache.poi.util.IOUtils 28 | import org.apache.poi.xssf.usermodel.XSSFWorkbookFactory 29 | import scala.collection.JavaConverters.mapAsScalaMapConverter 30 | 31 | trait WorkbookReader { 32 | protected def openWorkbook(): Workbook 33 | def withWorkbook[T](f: Workbook => T): T = { 34 | val workbook = openWorkbook() 35 | val res = f(workbook) 36 | workbook.close() 37 | res 38 | } 39 | def sheetNames: Seq[String] = { 40 | withWorkbook(workbook => 41 | for (sheetIx <- (0 until workbook.getNumberOfSheets())) yield { 42 | workbook.getSheetAt(sheetIx).getSheetName() 43 | } 44 | ) 45 | } 46 | } 47 | 48 | object WorkbookReader { 49 | val WithLocationMaxRowsInMemoryAndPassword = 50 | MapIncluding( 51 | Seq("path"), 52 | optionally = Seq("maxRowsInMemory", "workbookPassword", "maxByteArraySize", "tempFileThreshold") 53 | ) 54 | 55 | WorkbookFactory.addProvider(new HSSFWorkbookFactory) 56 | WorkbookFactory.addProvider(new XSSFWorkbookFactory) 57 | 58 | def apply(parameters: java.util.HashMap[String, String], hadoopConfiguration: Configuration): WorkbookReader = { 59 | apply(parameters.asScala.toMap, hadoopConfiguration) 60 | } 61 | 62 | def apply(parameters: Map[String, String], hadoopConfiguration: Configuration): WorkbookReader = { 63 | def readFromHadoop(location: String) = { 64 | val path = new Path(location) 65 | FileSystem.get(path.toUri, hadoopConfiguration).open(path) 66 | } 67 | parameters match { 68 | case WithLocationMaxRowsInMemoryAndPassword( 69 | Seq(location), 70 | Seq(Some(maxRowsInMemory), passwordOption, maxByteArraySizeOption, tempFileThreshold) 71 | ) => 72 | new StreamingWorkbookReader( 73 | readFromHadoop(location), 74 | passwordOption, 75 | maxRowsInMemory.toInt, 76 | maxByteArraySizeOption.map(_.toInt), 77 | tempFileThreshold.map(_.toInt) 78 | ) 79 | case WithLocationMaxRowsInMemoryAndPassword( 80 | Seq(location), 81 | Seq(None, passwordOption, maxByteArraySizeOption, tempFileThresholdOption) 82 | ) => 83 | new DefaultWorkbookReader( 84 | readFromHadoop(location), 85 | passwordOption, 86 | maxByteArraySizeOption.map(_.toInt), 87 | tempFileThresholdOption.map(_.toInt) 88 | ) 89 | } 90 | } 91 | } 92 | class DefaultWorkbookReader( 93 | inputStreamProvider: => InputStream, 94 | workbookPassword: Option[String], 95 | maxByteArraySize: Option[Int], 96 | tempFileThreshold: Option[Int] 97 | ) extends WorkbookReader { 98 | 99 | protected def openWorkbook(): Workbook = { 100 | maxByteArraySize.foreach { maxSize => 101 | IOUtils.setByteArrayMaxOverride(maxSize) 102 | } 103 | tempFileThreshold.foreach { threshold 
=> 104 | ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(threshold) 105 | } 106 | workbookPassword 107 | .fold(WorkbookFactory.create(inputStreamProvider))(password => 108 | WorkbookFactory.create(inputStreamProvider, password) 109 | ) 110 | } 111 | } 112 | 113 | class StreamingWorkbookReader( 114 | inputStreamProvider: => InputStream, 115 | workbookPassword: Option[String], 116 | maxRowsInMem: Int, 117 | maxByteArraySize: Option[Int], 118 | tempFileThreshold: Option[Int] 119 | ) extends WorkbookReader { 120 | override protected def openWorkbook(): Workbook = { 121 | maxByteArraySize.foreach { maxSize => 122 | IOUtils.setByteArrayMaxOverride(maxSize) 123 | } 124 | tempFileThreshold.foreach { threshold => 125 | ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(threshold) 126 | } 127 | val builder = StreamingReader 128 | .builder() 129 | .rowCacheSize(maxRowsInMem) 130 | .bufferSize(4096) 131 | workbookPassword 132 | .fold(builder)(password => builder.password(password)) 133 | .open(inputStreamProvider) 134 | } 135 | } 136 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark 18 | 19 | import org.apache.poi.ss.usermodel.Row.MissingCellPolicy 20 | import org.apache.poi.ss.usermodel.{Cell, CellType, Row} 21 | import org.apache.spark.sql.{DataFrameReader, DataFrameWriter} 22 | import spoiwo.model.Sheet 23 | 24 | package object excel { 25 | implicit class RichRow(val row: Row) extends AnyVal { 26 | def eachCellIterator(startColumn: Int, endColumn: Int): Iterator[Option[Cell]] = 27 | new Iterator[Option[Cell]] { 28 | private val lastCellInclusive = row.getLastCellNum - 1 29 | private val endCol = Math.min(endColumn, Math.max(startColumn, lastCellInclusive)) 30 | require(startColumn >= 0 && startColumn <= endCol) 31 | 32 | private var nextCol = startColumn 33 | 34 | override def hasNext: Boolean = nextCol <= endCol && nextCol <= lastCellInclusive 35 | 36 | override def next(): Option[Cell] = { 37 | val next = 38 | if (nextCol > endCol) throw new NoSuchElementException(s"column index = $nextCol") 39 | else Option(row.getCell(nextCol, MissingCellPolicy.RETURN_NULL_AND_BLANK)) 40 | nextCol += 1 41 | next 42 | } 43 | } 44 | } 45 | 46 | implicit class RichCell(val cell: Cell) extends AnyVal { 47 | def value: Any = 48 | cell.getCellType match { 49 | case CellType.BLANK | CellType.ERROR | CellType._NONE => null 50 | case CellType.NUMERIC => cell.getNumericCellValue 51 | case CellType.STRING => cell.getStringCellValue 52 | case CellType.BOOLEAN => cell.getBooleanCellValue 53 | case CellType.FORMULA => 54 | cell.getCachedFormulaResultType match { 55 | case CellType.BLANK => null 56 | case CellType.NUMERIC => cell.getNumericCellValue 57 | case CellType.STRING => cell.getRichStringCellValue 58 | case CellType.BOOLEAN => cell.getBooleanCellValue 59 | case _ => null 60 | } 61 | } 62 | } 63 | 64 | implicit class RichSpoiwoSheet(val sheet: Sheet) extends AnyVal { 65 | def extractTableData(tableNumber: Int): Seq[Seq[Any]] = { 66 | val table = sheet.tables(tableNumber) 67 | val (startRow, endRow) = table.cellRange.rowRange 68 | val (startColumn, endColumn) = table.cellRange.columnRange 69 | val tableRows = sheet.rows.filter(r => r.index.exists((startRow to endRow).contains)) 70 | tableRows.map(_.cells.filter(_.index.exists((startColumn to endColumn).contains)).map(_.value).toSeq) 71 | } 72 | } 73 | 74 | implicit class ExcelDataFrameReader(val dataFrameReader: DataFrameReader) extends AnyVal { 75 | def excel( 76 | header: Boolean = true, 77 | treatEmptyValuesAsNulls: Boolean = false, 78 | setErrorCellsToFallbackValues: Boolean = false, 79 | inferSchema: Boolean = false, 80 | usePlainNumberFormat: Boolean = false, 81 | addColorColumns: Boolean = false, 82 | dataAddress: String = null, 83 | timestampFormat: String = null, 84 | maxRowsInMemory: java.lang.Integer = null, 85 | maxByteArraySize: java.lang.Integer = null, 86 | tempFileThreshold: java.lang.Integer = null, 87 | excerptSize: Int = 10, 88 | workbookPassword: String = null 89 | ): DataFrameReader = { 90 | Map( 91 | "header" -> header, 92 | "treatEmptyValuesAsNulls" -> treatEmptyValuesAsNulls, 93 | "setErrorCellsToFallbackValues" -> setErrorCellsToFallbackValues, 94 | "usePlainNumberFormat" -> usePlainNumberFormat, 95 | "inferSchema" -> inferSchema, 96 | "addColorColumns" -> addColorColumns, 97 | "dataAddress" -> dataAddress, 98 | "timestampFormat" -> timestampFormat, 99 | "maxRowsInMemory" -> maxRowsInMemory, 100 | "maxByteArraySize" -> maxByteArraySize, 101 | "tempFileThreshold" -> tempFileThreshold, 102 | "excerptSize" -> excerptSize, 103 | "workbookPassword" -> workbookPassword 104 | 
).foldLeft(dataFrameReader.format("dev.mauch.spark.excel")) { case (dfReader, (key, value)) => 105 | value match { 106 | case null => dfReader 107 | case v => dfReader.option(key, v.toString) 108 | } 109 | } 110 | } 111 | } 112 | 113 | implicit class ExcelDataFrameWriter[T](val dataFrameWriter: DataFrameWriter[T]) extends AnyVal { 114 | def excel( 115 | header: Boolean = true, 116 | dataAddress: String = null, 117 | preHeader: String = null, 118 | dateFormat: String = null, 119 | timestampFormat: String = null, 120 | workbookPassword: String = null 121 | ): DataFrameWriter[T] = { 122 | Map( 123 | "header" -> header, 124 | "dataAddress" -> dataAddress, 125 | "dateFormat" -> dateFormat, 126 | "timestampFormat" -> timestampFormat, 127 | "workbookPassword" -> workbookPassword, 128 | "preHeader" -> preHeader 129 | ).foldLeft(dataFrameWriter.format("dev.mauch.spark.excel")) { case (dfWriter, (key, value)) => 130 | value match { 131 | case null => dfWriter 132 | case v => dfWriter.option(key, v.toString) 133 | } 134 | } 135 | } 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/v2/ExcelHeaderChecker.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.internal.Logging 20 | import org.apache.spark.sql.internal.SQLConf 21 | import org.apache.spark.sql.types.StructType 22 | 23 | /** Checks that column names in a Excel header and field names in the schema are the same by taking into account case 24 | * sensitivity. 25 | * 26 | * @param schema 27 | * provided (or inferred) schema to which Excel must conform. 28 | * @param options 29 | * parsed Excel options. 30 | * @param source 31 | * name of Excel source that are currently checked. It is used in error messages. 32 | */ 33 | class ExcelHeaderChecker(schema: StructType, options: ExcelOptions, source: String) extends Logging { 34 | 35 | /** Indicates if it is set to `false`, comparison of column names and schema field names is not case sensitive. 36 | */ 37 | private val caseSensitive = SQLConf.get.caseSensitiveAnalysis 38 | 39 | /** Indicates if it is `true`, column names are ignored otherwise the Excel column names are checked for conformance 40 | * to the schema. In the case if the column name don't conform to the schema, an exception is thrown. 41 | */ 42 | private val enforceSchema = options.enforceSchema 43 | 44 | /** Checks that column names in a Excel header and field names in the schema are the same by taking into account case 45 | * sensitivity. 46 | * 47 | * @param columnNames 48 | * names of Excel columns that must be checked against to the schema. 
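   * @example
   *   A sketch of the intended behaviour (assumed values, not from the original source): with schema
   *   fields ("id", "name") and header columns ("ID", "Name") the check passes while
   *   spark.sql.caseSensitive is false; with header columns ("id", "surname") a mismatch message is
   *   built and either logged as a warning (enforceSchema = true) or thrown as an
   *   IllegalArgumentException (enforceSchema = false).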
49 | */ 50 | def checkHeaderColumnNames(columnNames: Vector[String]): Unit = { 51 | if (columnNames != null) { 52 | val fieldNames = schema.map(_.name).toIndexedSeq 53 | val (headerLen, schemaSize) = (columnNames.size, fieldNames.length) 54 | var errorMessage: Option[String] = None 55 | 56 | if (headerLen == schemaSize) { 57 | var i = 0 58 | while (errorMessage.isEmpty && i < headerLen) { 59 | var (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i)) 60 | if (!caseSensitive) { 61 | // scalastyle:off caselocale 62 | nameInSchema = nameInSchema.toLowerCase 63 | nameInHeader = nameInHeader.toLowerCase 64 | // scalastyle:on caselocale 65 | } 66 | if (nameInHeader != nameInSchema) { 67 | errorMessage = Some(s"""|Excel header does not conform to the schema. 68 | | Header: ${columnNames.mkString(", ")} 69 | | Schema: ${fieldNames.mkString(", ")} 70 | |Expected: ${fieldNames(i)} but found: ${columnNames(i)} 71 | |$source""".stripMargin) 72 | } 73 | i += 1 74 | } 75 | } else { 76 | errorMessage = Some(s"""|Number of column in Excel header is not equal to number of fields in the schema: 77 | | Header length: $headerLen, schema size: $schemaSize 78 | |$source""".stripMargin) 79 | } 80 | 81 | errorMessage.foreach { msg => 82 | if (enforceSchema) { logWarning(msg) } 83 | else { throw new IllegalArgumentException(msg) } 84 | } 85 | } 86 | } 87 | 88 | } 89 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/v2/ExcelOptionsTrait.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql.catalyst.util.{ 20 | CaseInsensitiveMap, 21 | DateFormatter, 22 | DateTimeUtils, 23 | ParseMode, 24 | PermissiveMode, 25 | TimestampFormatter 26 | } 27 | 28 | import java.time.ZoneId 29 | import java.util.Locale 30 | import scala.annotation.nowarn 31 | 32 | trait ExcelOptionsTrait extends Serializable { 33 | 34 | val parameters: CaseInsensitiveMap[String] 35 | val defaultTimeZoneId: String 36 | val defaultColumnNameOfCorruptRecord: String 37 | 38 | private def getInt(paramName: String): Option[Int] = { 39 | val paramValue = parameters.get(paramName) 40 | paramValue match { 41 | case None => None 42 | case Some(null) => None 43 | case Some(value) => 44 | try { 45 | Some(value.toInt) 46 | } catch { 47 | case _: NumberFormatException => 48 | throw new RuntimeException(s"$paramName should be an integer. 
Found $value") 49 | } 50 | } 51 | } 52 | 53 | private def getBool(paramName: String, default: Boolean): Boolean = { 54 | val param = parameters.getOrElse(paramName, default.toString) 55 | if (param == null) { 56 | default 57 | } else if (param.toLowerCase(Locale.ROOT) == "true") { 58 | true 59 | } else if (param.toLowerCase(Locale.ROOT) == "false") { 60 | false 61 | } else { 62 | throw new Exception(s"$paramName flag can be true or false") 63 | } 64 | } 65 | 66 | /* Parsing mode, how to handle corrupted record. Default to permissive */ 67 | val parseMode: ParseMode = parameters 68 | .get("mode") 69 | .map(ParseMode.fromString) 70 | .getOrElse(PermissiveMode) 71 | 72 | val zoneId: ZoneId = ZoneId 73 | .of(parameters.getOrElse(DateTimeUtils.TIMEZONE_OPTION, defaultTimeZoneId)) 74 | 75 | /* A language tag in IETF BCP 47 format */ 76 | val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US) 77 | 78 | val dateFormat: String = parameters.getOrElse("dateFormat", DateFormatter.defaultPattern) 79 | 80 | @nowarn 81 | val timestampFormat: String = parameters.getOrElse("timestampFormat", TimestampFormatter.defaultPattern) 82 | 83 | /* Have header line when reading and writing */ 84 | val header = getBool("header", default = true) 85 | 86 | /* Number of rows to ignore after header. Only in reading */ 87 | val ignoreAfterHeader = getInt("ignoreAfterHeader").getOrElse(0) 88 | 89 | val inferSchema = getBool("inferSchema", default = false) 90 | val excerptSize = getInt("excerptSize") 91 | 92 | /** Forcibly apply the specified or inferred schema to data files. If the option is enabled, headers of ABC files will 93 | * be ignored. 94 | */ 95 | val enforceSchema = getBool("enforceSchema", default = true) 96 | 97 | /* Name for column of corrupted records */ 98 | val columnNameOfCorruptRecord = parameters 99 | .getOrElse("columnNameOfCorruptRecord", defaultColumnNameOfCorruptRecord) 100 | 101 | val nullValue = parameters.getOrElse("nullValue", "") 102 | val nanValue = parameters.getOrElse("nanValue", "NaN") 103 | val positiveInf = parameters.getOrElse("positiveInf", "Inf") 104 | val negativeInf = parameters.getOrElse("negativeInf", "-Inf") 105 | 106 | /* If true, format the cells without rounding and scientific notations */ 107 | val usePlainNumberFormat = getBool("usePlainNumberFormat", default = false) 108 | 109 | /* If true, keep undefined (Excel) rows */ 110 | val keepUndefinedRows = getBool("keepUndefinedRows", default = false) 111 | 112 | /* Use null value for error cells */ 113 | val useNullForErrorCells = getBool("useNullForErrorCells", default = false) 114 | 115 | /* Additional column for color */ 116 | val addColorColumns = getBool("addColorColumns", default = false) 117 | val ignoreLeadingWhiteSpace = getBool("ignoreLeadingWhiteSpace", default = false) 118 | val ignoreTrailingWhiteSpace = getBool("ignoreTrailingWhiteSpace", default = false) 119 | 120 | /* Additional column for excel row number */ 121 | val columnNameOfRowNumber = parameters.get("columnNameOfRowNumber") 122 | 123 | /* Data address, default to everything */ 124 | val dataAddress = parameters.getOrElse("dataAddress", "A1") 125 | 126 | /* Workbook password, optional */ 127 | val workbookPassword = parameters.get("workbookPassword") 128 | 129 | /* Output excel file extension, default to xlsx */ 130 | val fileExtension = parameters.get("fileExtension") match { 131 | case Some(value) => value.trim 132 | case None => "xlsx" 133 | } 134 | 135 | /* Defines fraction of file used for schema inferring. 
For default and 136 | invalid values, 1.0 will be used */ 137 | val samplingRatio = { 138 | val r = parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0) 139 | if (r > 1.0 || r <= 0.0) 1.0 else r 140 | } 141 | 142 | /** Optional parameter for using a streaming reader which can help with big files (will fail if used with xls format 143 | * files) 144 | */ 145 | val maxRowsInMemory = getInt("maxRowsInMemory") 146 | 147 | // scalastyle:off 148 | /** Optional parameter for <a 149 | * href="https://poi.apache.org/apidocs/5.0/org/apache/poi/util/IOUtils.html#setByteArrayMaxOverride-int-">maxByteArraySize</a> 150 | */ 151 | val maxByteArraySize = getInt("maxByteArraySize") 152 | 153 | // scalastyle:on 154 | /** Optional parameter for specifying the number of bytes at which a zip entry is regarded as too large for holding in 155 | * memory and the data is put in a temp file instead - useful for sheets with a lot of data 156 | */ 157 | val tempFileThreshold = getInt("tempFileThreshold") 158 | 159 | // scalastyle:on 160 | /** Optional parameter to specify whether the sheet name in dataAddress is a regex (for loading multiple sheets at 161 | * once) or the true sheet name 162 | */ 163 | val sheetNameIsRegex = getBool("sheetNameIsRegex", false) 164 | } 165 | -------------------------------------------------------------------------------- /src/main/scala/dev/mauch/spark/excel/v2/SheetData.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import java.io.Closeable 20 | 21 | case class SheetData[T](rowIterator: Iterator[T], resourcesToClose: Seq[Closeable] = Seq.empty) extends Closeable { 22 | def modifyIterator(f: Iterator[T] => Iterator[T]): SheetData[T] = SheetData(f(rowIterator), resourcesToClose) 23 | def append(other: SheetData[T]): SheetData[T] = 24 | SheetData(rowIterator ++ other.rowIterator, resourcesToClose ++ other.resourcesToClose) 25 | override def close(): Unit = resourcesToClose.foreach(_.close()) 26 | } 27 | -------------------------------------------------------------------------------- /src/test/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | # config for log4j 1.x (spark < 3.3) 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.out 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 7 | 8 | # config for log4j 2.x (spark >= 3.3) 9 | # Extra logging related to initialization of Log4j 10 | # Set to debug or trace if log4j initialization is failing 11 | status = warn 12 | 13 | 14 | # Console appender configuration 15 | appender.console.type = Console 16 | appender.console.name = consoleLogger 17 | appender.console.layout.type = PatternLayout 18 | appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n 19 | 20 | # Root logger level 21 | rootLogger.level = warn 22 | # Root logger referring to console appender 23 | rootLogger.appenderRef.stdout.ref = consoleLogger 24 | -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/Issue_747_plain_number.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/Issue_747_plain_number.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/apache_poi/57231_MixedGasReport.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/apache_poi/57231_MixedGasReport.xls -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/apache_poi/DataTableCities.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/apache_poi/DataTableCities.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=1/ca_03.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=1/ca_03.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_04.xlsx: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_04.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_05.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_05.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_06.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_06.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_07.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_07.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_08.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_08.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_09.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_09.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_10.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_10.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_11.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_11.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_12.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_12.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/infer_stricter_numerical_types.xls: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/infer_stricter_numerical_types.xls -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/infer_stricter_numerical_types.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/infer_stricter_numerical_types.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_162_nihar_gharat.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_162_nihar_gharat.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_285_bryce21.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_285_bryce21.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_463_cristichircu.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_463_cristichircu.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_942_sheetname_digits.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_942_sheetname_digits.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_944_faulty_dimension.md: -------------------------------------------------------------------------------- 1 | The issue_944_faulty_dimension.xlsx file contains `<dimension>` tags on each sheet, that does not conform to the true / physical size of the sheets (e.g. `<dimension ref="A1"/>` instead of `<dimension ref="A1:E2"/>` for sheet1). 2 | 3 | It was fabricated by hand and is used to test the library's ability to handle such cases. 
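
As a rough usage sketch (the reader options and the local resource path below are illustrative assumptions, not taken from the test suite), such a file would typically be loaded like any other workbook, so that reads are not confused by the misleading `<dimension>` tag:

```scala
import org.apache.spark.sql.SparkSession

// Hedged example: "excel" is the registered short name of this data source;
// the header option and the path are assumptions for illustration only.
val spark = SparkSession.builder().master("local[*]").appName("faulty-dimension-demo").getOrCreate()
val df = spark.read
  .format("excel")
  .option("header", "true")
  .load("src/test/resources/spreadsheets/issue_944_faulty_dimension.xlsx")
df.show()
```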
4 | 5 | This is how the file was created: 6 | * take a valid excel file 7 | * rename extension from xlsx to zip 8 | * unzip it 9 | * patch the `<dimension>` tags in `xl/worksheets/sheet1.xml` and `xl/worksheets/sheet2.xml` 10 | * zip it back 11 | * rename extension back to xlsx 12 | -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_944_faulty_dimension.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_944_faulty_dimension.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_965_blank_rows.md: -------------------------------------------------------------------------------- 1 | The issue_965_blank_rows.xlsx is used to test that rows containing no values are discarded if read with keepUndefinedRows == False. 2 | 3 | The Excel was fabricated by hand and is used to test the library's ability to handle such cases. 4 | 5 | This is how the file was created: 6 | * take a valid excel file 7 | * rename extension from xlsx to zip 8 | * unzip it 9 | * add empty row definitions to `xl/worksheets/sheet1.xml` (see) below) 10 | * zip it back 11 | * rename extension back to xlsx 12 | 13 | 14 | The empty row definitions added to the file are as follows: 15 | ```xml 16 | <row r="5" spans="1:7" x14ac:dyDescent="0.25"> 17 | <c r="A5" s="1"/> 18 | <c r="B5" s="1"/> 19 | </row> 20 | ```` -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/issue_965_blank_rows.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_965_blank_rows.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/plain_number.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/plain_number.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/read_multiple_sheets_at_once.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/read_multiple_sheets_at_once.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/read_multiple_sheets_at_once_noheader.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/read_multiple_sheets_at_once_noheader.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/simple_encrypted.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/simple_encrypted.xls 
-------------------------------------------------------------------------------- /src/test/resources/spreadsheets/simple_encrypted.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/simple_encrypted.xlsx -------------------------------------------------------------------------------- /src/test/resources/spreadsheets/with_errors_all_types.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/with_errors_all_types.xlsx -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/DataFrameSuiteBase.scala: -------------------------------------------------------------------------------- 1 | package dev.mauch.spark 2 | 3 | import com.github.mrpowers.spark.fast.tests.DataFrameComparer 4 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 5 | 6 | import java.sql.Timestamp 7 | 8 | trait DataFrameSuiteBase extends DataFrameComparer { 9 | 10 | lazy val spark: SparkSession = SparkSession 11 | .builder() 12 | .master("local") 13 | .appName("spark-excel session") 14 | .config("spark.sql.shuffle.partitions", "1") 15 | .getOrCreate() 16 | 17 | def assertDataFrameEquals(df1: DataFrame, df2: DataFrame): Unit = 18 | assertSmallDataFrameEquality(df1, df2) 19 | 20 | def assertDataFrameApproximateEquals(expectedDF: DataFrame, actualDF: DataFrame, relTol: Double): Unit = { 21 | val e = (r1: Row, r2: Row) => { 22 | r1.equals(r2) || RelTolComparer.areRowsEqual(r1, r2, relTol) 23 | } 24 | assertLargeDatasetEquality[Row]( 25 | actualDF, 26 | expectedDF, 27 | equals = e, 28 | ignoreNullable = false, 29 | ignoreColumnNames = false, 30 | orderedComparison = false 31 | ) 32 | } 33 | 34 | def assertDataFrameNoOrderEquals(df1: DataFrame, df2: DataFrame): Unit = 35 | assertSmallDataFrameEquality(df1, df2, orderedComparison = false) 36 | } 37 | 38 | object RelTolComparer { 39 | 40 | trait ToNumeric[T] { 41 | def toNumeric(x: Double): T 42 | } 43 | object ToNumeric { 44 | implicit val doubleToDouble: ToNumeric[Double] = new ToNumeric[Double] { 45 | def toNumeric(x: Double): Double = x 46 | } 47 | implicit val doubleToFloat: ToNumeric[Float] = new ToNumeric[Float] { 48 | def toNumeric(x: Double): Float = x.toFloat 49 | } 50 | implicit val doubleToLong: ToNumeric[Long] = new ToNumeric[Long] { 51 | def toNumeric(x: Double): Long = x.toLong 52 | } 53 | implicit val doubleToBigDecimal: ToNumeric[BigDecimal] = new ToNumeric[BigDecimal] { 54 | def toNumeric(x: Double): BigDecimal = BigDecimal(x) 55 | } 56 | } 57 | 58 | /** Approximate equality, based on equals from [[Row]] */ 59 | def areRowsEqual(r1: Row, r2: Row, relTol: Double): Boolean = { 60 | def withinRelTol[T : Numeric : ToNumeric](a: T, b: T): Boolean = { 61 | val num = implicitly[Numeric[T]] 62 | val toNum = implicitly[ToNumeric[T]] 63 | val absTol = num.times(toNum.toNumeric(relTol), num.max(num.abs(a), num.abs(b))) 64 | val diff = num.abs(num.minus(a, b)) 65 | num.lteq(diff, absTol) 66 | } 67 | 68 | if (r1.length != r2.length) { 69 | return false 70 | } else { 71 | (0 until r1.length).foreach(idx => { 72 | if (r1.isNullAt(idx) != r2.isNullAt(idx)) { 73 | return false 74 | } 75 | 76 | if (!r1.isNullAt(idx)) { 77 | val o1 = r1.get(idx) 78 | val o2 = r2.get(idx) 79 | o1 match { 80 | case 
b1: Array[Byte] => 81 | if (!java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) { 82 | return false 83 | } 84 | 85 | case f1: Float => 86 | if ( 87 | java.lang.Float.isNaN(f1) != 88 | java.lang.Float.isNaN(o2.asInstanceOf[Float]) 89 | ) { 90 | return false 91 | } 92 | if (!withinRelTol[Float](f1, o2.asInstanceOf[Float])) { 93 | return false 94 | } 95 | 96 | case d1: Double => 97 | if ( 98 | java.lang.Double.isNaN(d1) != 99 | java.lang.Double.isNaN(o2.asInstanceOf[Double]) 100 | ) { 101 | return false 102 | } 103 | if (!withinRelTol[Double](d1, o2.asInstanceOf[Double])) { 104 | return false 105 | } 106 | 107 | case d1: java.math.BigDecimal => 108 | if (!withinRelTol(BigDecimal(d1), BigDecimal(o2.asInstanceOf[java.math.BigDecimal]))) { 109 | return false 110 | } 111 | 112 | case t1: Timestamp => 113 | if (!withinRelTol(t1.getTime, o2.asInstanceOf[Timestamp].getTime)) { 114 | return false 115 | } 116 | 117 | case _ => 118 | if (o1 != o2) return false 119 | } 120 | } 121 | }) 122 | } 123 | true 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/DataLocatorSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.poi.xssf.usermodel.XSSFWorkbook 20 | import org.scalacheck.Gen 21 | import org.scalatest.funspec.AnyFunSpec 22 | import org.scalatest.matchers.should.Matchers 23 | import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks 24 | import spoiwo.model.Workbook 25 | import spoiwo.natures.xlsx.Model2XlsxConversions._ 26 | 27 | import scala.jdk.CollectionConverters._ 28 | import scala.collection.compat._ 29 | 30 | class DataLocatorSuite extends AnyFunSpec with ScalaCheckPropertyChecks with Matchers with Generators { 31 | describe("with a table reference") { 32 | val dl = DataLocator(Map("dataAddress" -> s"$tableName[#All]")) 33 | describe("containing #All") { 34 | it("extracts the entire table data") { 35 | forAll(sheetWithTableGen) { sheet => 36 | val actualData = dl.readFrom(sheet.convertAsXlsx()).map(_.map(_.value)).to(Seq) 37 | actualData should contain theSameElementsAs sheet.extractTableData(0) 38 | } 39 | } 40 | 41 | it("writes into a new table in a new sheet if no corresponding table exists") { 42 | forAll(sheetGenerator(withHeader = Gen.const(true), numCols = Gen.choose(1, 200))) { dataSheet => 43 | val workbook = new XSSFWorkbook() 44 | val header = dataSheet.rows.head.cells.map(_.value.toString).toSeq 45 | val generatedSheet = dl.toSheet( 46 | header = Some(header), 47 | data = dataSheet.rows.tail.iterator.map(_.cells.map(_.value.toString).toSeq), 48 | existingWorkbook = workbook 49 | ) 50 | generatedSheet.convertAsXlsx(workbook) 51 | val pTable = workbook.getTable(tableName) 52 | pTable.getSheetName should equal(tableName) 53 | pTable.getColumns.asScala.map(_.getName) should contain theSameElementsInOrderAs header 54 | val actualData = dl.readFrom(workbook).map(_.map(_.value)).to(Seq) 55 | actualData should contain theSameElementsAs dataSheet.rows.map(_.cells.map(_.value)) 56 | } 57 | } 58 | 59 | it("overwrites an existing table") { 60 | forAll(sheetWithTableGen) { sheetWithTable => 61 | val workbook = sheetWithTable.convertAsXlsx() 62 | val table = sheetWithTable.tables.head 63 | val header = table.columns.map(_.name) 64 | val tableData = dl.readFrom(workbook).map(_.map(c => s"new_$c")).toList 65 | val generatedSheet = 66 | dl.toSheet(header = tableData.headOption, data = tableData.iterator.drop(1), existingWorkbook = workbook) 67 | Workbook(generatedSheet).writeToExisting(workbook) 68 | val pTable = workbook.getTable(tableName) 69 | pTable.getSheetName should equal(sheetName) 70 | pTable.getColumns.asScala.map(_.getName) should contain theSameElementsInOrderAs header 71 | val actualData = dl.readFrom(workbook).map(_.map(_.value)).to(Seq) 72 | actualData should contain theSameElementsAs tableData 73 | } 74 | } 75 | } 76 | } 77 | describe("without any dataAddress") { 78 | it("defaults to starting at cell A1 in the first sheet") { 79 | val dl = DataLocator(Map()) 80 | dl shouldBe a[CellRangeAddressDataLocator] 81 | val cradl = dl.asInstanceOf[CellRangeAddressDataLocator] 82 | cradl.dataAddress.getFirstCell.formatAsString() should equal("A1") 83 | cradl.dataAddress.getFirstCell.getSheetName should equal(null) 84 | } 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/EncryptedReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use 
this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.spark.sql._ 20 | import org.apache.spark.sql.types._ 21 | 22 | import dev.mauch.spark.DataFrameSuiteBase 23 | import org.scalatest.funspec.AnyFunSpec 24 | import org.scalatest.matchers.should.Matchers 25 | import scala.jdk.CollectionConverters._ 26 | 27 | object EncryptedReadSuite { 28 | val simpleSchema = StructType( 29 | List( 30 | StructField("A", DoubleType, true), 31 | StructField("B", DoubleType, true), 32 | StructField("C", DoubleType, true), 33 | StructField("D", DoubleType, true) 34 | ) 35 | ) 36 | 37 | val expectedData = List(Row(1.0d, 2.0d, 3.0d, 4.0d)).asJava 38 | } 39 | 40 | class EncryptedReadSuite extends AnyFunSpec with DataFrameSuiteBase with Matchers { 41 | import EncryptedReadSuite._ 42 | 43 | lazy val expected = spark.createDataFrame(expectedData, simpleSchema) 44 | 45 | def readFromResources(path: String, password: String, maxRowsInMemory: Option[Int] = None): DataFrame = { 46 | val url = getClass.getResource(path) 47 | val reader = spark.read 48 | .excel( 49 | dataAddress = s"Sheet1!A1", 50 | treatEmptyValuesAsNulls = true, 51 | workbookPassword = password, 52 | inferSchema = true 53 | ) 54 | val withMaxRows = maxRowsInMemory.fold(reader)(rows => reader.option("maxRowsInMemory", s"$rows")) 55 | withMaxRows.load(url.getPath) 56 | } 57 | 58 | describe("spark-excel") { 59 | it("should read encrypted xslx file") { 60 | val df = readFromResources("/spreadsheets/simple_encrypted.xlsx", "fooba") 61 | 62 | assertDataFrameEquals(expected, df) 63 | } 64 | 65 | it("should read encrypted xlsx file with maxRowsInMem=10") { 66 | val df = readFromResources("/spreadsheets/simple_encrypted.xlsx", "fooba", maxRowsInMemory = Some(10)) 67 | 68 | assertDataFrameEquals(expected, df) 69 | } 70 | 71 | it("should read encrypted xlsx file with maxRowsInMem=1") { 72 | val df = readFromResources("/spreadsheets/simple_encrypted.xlsx", "fooba", maxRowsInMemory = Some(1)) 73 | 74 | assertDataFrameEquals(expected, df) 75 | } 76 | 77 | it("should read encrypted xls file") { 78 | val df = readFromResources("/spreadsheets/simple_encrypted.xls", "fooba") 79 | 80 | assertDataFrameEquals(expected, df) 81 | } 82 | } 83 | } 84 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/ErrorsAsStringsReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.types._ 21 | import org.apache.spark.sql.{Row, _} 22 | import org.scalatest.funspec.AnyFunSpec 23 | import org.scalatest.matchers.should.Matchers 24 | 25 | import java.sql.Timestamp 26 | import java.time.LocalDateTime 27 | import java.util 28 | import scala.jdk.CollectionConverters._ 29 | 30 | object ErrorsAsStringsReadSuite { 31 | private val dummyTimestamp = Timestamp.valueOf(LocalDateTime.of(2021, 2, 19, 0, 0)) 32 | private val epochTimestamp = new Timestamp(0) 33 | private val dummyText = "hello" 34 | 35 | private val expectedSchemaInfer = StructType( 36 | List( 37 | StructField("double", DoubleType, true), 38 | StructField("boolean", BooleanType, true), 39 | StructField("timestamp", TimestampType, true), 40 | StructField("string", StringType, true), 41 | StructField("formula", StringType, true) 42 | ) 43 | ) 44 | private val expectedDataErrorsAsStringsInfer: util.List[Row] = 45 | List( 46 | Row(1.0, true, dummyTimestamp, dummyText, "A1"), 47 | Row(2.0, false, dummyTimestamp, dummyText, "A3"), 48 | Row(0.0, false, epochTimestamp, "", ""), 49 | Row(0.0, false, epochTimestamp, "", "") 50 | ).asJava 51 | 52 | private val expectedDataErrorsAsNullInfer: util.List[Row] = 53 | List( 54 | Row(1.0, true, dummyTimestamp, dummyText, "A1"), 55 | Row(2.0, false, dummyTimestamp, dummyText, "A3"), 56 | Row(null, null, null, null, null), 57 | Row(null, null, null, null, null) 58 | ).asJava 59 | 60 | private val expectedSchemaNonInfer = StructType( 61 | List( 62 | StructField("double", StringType, true), 63 | StructField("boolean", StringType, true), 64 | StructField("timestamp", StringType, true), 65 | StructField("string", StringType, true), 66 | StructField("formula", StringType, true) 67 | ) 68 | ) 69 | private val expectedDataErrorsAsStringsNonInfer: util.List[Row] = 70 | List( 71 | Row("1", "TRUE", "19\"-\"Feb\"-\"2021", dummyText, "A1"), 72 | Row("2", "FALSE", "19\"-\"Feb\"-\"2021", dummyText, "A3"), 73 | Row("", "", "", "", ""), 74 | Row("", "", "", "", "") 75 | ).asJava 76 | 77 | private val expectedDataErrorsAsNullNonInfer: util.List[Row] = 78 | List( 79 | Row("1", "TRUE", "19\"-\"Feb\"-\"2021", "hello", "A1"), 80 | Row("2", "FALSE", "19\"-\"Feb\"-\"2021", "hello", "A3"), 81 | Row(null, null, null, null, null), 82 | Row(null, null, null, null, null) 83 | ).asJava 84 | 85 | private val excelLocation = "/spreadsheets/with_errors_all_types.xlsx" 86 | } 87 | 88 | class ErrorsAsStringsReadSuite extends AnyFunSpec with DataFrameSuiteBase with Matchers { 89 | import ErrorsAsStringsReadSuite._ 90 | 91 | def readFromResources(path: String, setErrorCellsToFallbackValues: Boolean, inferSchema: Boolean): DataFrame = { 92 | val url = getClass.getResource(path) 93 | spark.read 94 | .excel(setErrorCellsToFallbackValues = setErrorCellsToFallbackValues, inferSchema = inferSchema, excerptSize = 3) 95 | .load(url.getPath) 96 | } 97 | 98 | describe("spark-excel") { 99 | it("should read errors in string format when setErrorCellsToFallbackValues=true and inferSchema=true") { 100 | val df = readFromResources(excelLocation, true, true) 101 | val expected = spark.createDataFrame(expectedDataErrorsAsStringsInfer, expectedSchemaInfer) 102 | assertDataFrameEquals(expected, df) 103 | } 104 | 105 | it("should read errors as null when setErrorCellsToFallbackValues=false and 
inferSchema=true") { 106 | val df = readFromResources(excelLocation, false, true) 107 | val expected = spark.createDataFrame(expectedDataErrorsAsNullInfer, expectedSchemaInfer) 108 | assertDataFrameEquals(expected, df) 109 | } 110 | 111 | it("should read errors in string format when setErrorCellsToFallbackValues=true and inferSchema=false") { 112 | val df = readFromResources(excelLocation, true, false) 113 | val expected = spark.createDataFrame(expectedDataErrorsAsStringsNonInfer, expectedSchemaNonInfer) 114 | assertDataFrameEquals(expected, df) 115 | } 116 | 117 | it("should read errors in string format when setErrorCellsToFallbackValues=false and inferSchema=false") { 118 | val df = readFromResources(excelLocation, false, false) 119 | val expected = spark.createDataFrame(expectedDataErrorsAsNullNonInfer, expectedSchemaNonInfer) 120 | assertDataFrameEquals(expected, df) 121 | } 122 | } 123 | } 124 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/RichRowSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel 18 | 19 | import org.apache.poi.ss.usermodel.{Cell, Row} 20 | import org.scalacheck.Gen 21 | import org.scalacheck.Prop.propBoolean 22 | import org.scalamock.scalatest.MockFactory 23 | 24 | import scala.util.Try 25 | import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks 26 | import org.scalatest.funsuite.AnyFunSuite 27 | 28 | trait RowGenerator extends MockFactory { 29 | private val MAX_WIDTH = 100 30 | 31 | protected case class GeneratedRow(start: Int, end: Int, lastCellNum: Int, row: Row) 32 | 33 | protected val rowGen: Gen[GeneratedRow] = for { 34 | startColumn <- Gen.choose(0, MAX_WIDTH - 1) 35 | endColumn <- Gen.choose(0, MAX_WIDTH - 1) 36 | lastCellNum <- Gen.choose(0, MAX_WIDTH - 1) 37 | row = stub[Row] 38 | _ = (row.getCell(_: Int)).when(*) returns stub[Cell] 39 | _ = (row.getLastCellNum _).when() returns lastCellNum.toShort 40 | } yield GeneratedRow(startColumn, endColumn, lastCellNum, row) 41 | } 42 | 43 | class RichRowSuite extends AnyFunSuite with ScalaCheckPropertyChecks with RowGenerator { 44 | test("Invalid cell range should throw an error") { 45 | forAll(rowGen) { g => 46 | (g.start > g.end) ==> Try { 47 | g.row.eachCellIterator(g.start, g.end).next() 48 | }.isFailure 49 | } 50 | } 51 | 52 | test("Valid cell range should iterate through all non-empty cells") { 53 | forAll(rowGen) { g => 54 | (g.start <= g.end && g.start < g.lastCellNum) ==> { 55 | val count = g.row.eachCellIterator(g.start, g.end).size 56 | count === Math.min(g.end, g.lastCellNum - 1) - g.start + 1 57 | } 58 | } 59 | } 60 | 61 | test("Valid cell range should should not iterate through non-empty cells") { 62 | forAll(rowGen) { g => 63 | (g.start <= g.end && g.start >= g.lastCellNum) ==> { 64 | g.row.eachCellIterator(g.start, g.end).size === 0 65 | } 66 | } 67 | } 68 | } 69 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/AreaReferenceReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.util 25 | import scala.jdk.CollectionConverters._ 26 | 27 | /** Loading data from a different data address (AreaReference) 28 | */ 29 | object AreaReferenceReadSuite { 30 | val expectedSchema_01 = StructType( 31 | List( 32 | StructField("Translations!$A$370", StringType, true), 33 | StructField("Translations!$A$371", LongType, true), 34 | StructField("Translations!$A$402", DoubleType, true), 35 | StructField("Translations!$A$393", DoubleType, true), 36 | StructField("Translations!$A$384", DoubleType, true), 37 | StructField("Translations!$A$405", DoubleType, true), 38 | StructField("Translations!$A$396", DoubleType, true), 39 | StructField("Translations!$A$387", DoubleType, true), 40 | StructField("Translations!$A$418", DoubleType, true), 41 | StructField("Translations!$A$419", DoubleType, true), 42 | StructField("Translations!$A$4110", DoubleType, true) 43 | ) 44 | ) 45 | 46 | /* Manually checking 1 row only */ 47 | val expectedData_01: util.List[Row] = List( 48 | Row( 49 | "Alabama", 50 | 140895441L, 51 | 458d, 52 | 122d, 53 | 85116d, 54 | 1009700176.36684d, 55 | 268959435.626102d, 56 | 187645502645.503d, 57 | 0.0072d, 58 | 0.0019d, 59 | 1.3318d 60 | ) 61 | ).asJava 62 | 63 | } 64 | 65 | class AreaReferenceReadSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 66 | import AreaReferenceReadSuite._ 67 | 68 | test("AreaReference from a different sheet with testing data from Apache POI upstream tests") { 69 | val df = readFromResources( 70 | spark, 71 | path = "apache_poi/57231_MixedGasReport.xls", 72 | options = Map("dataAddress" -> "'Coefficient Table'!A6", "ignoreAfterHeader" -> 2, "inferSchema" -> true) 73 | ).limit(1) 74 | val expected = spark.createDataFrame(expectedData_01, expectedSchema_01) 75 | assertDataFrameApproximateEquals(expected, df, 0.1e-1) 76 | } 77 | 78 | } 79 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/DataFrameWriterApiComplianceSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql._ 21 | import org.scalatest.wordspec.AnyWordSpec 22 | 23 | class DataFrameWriterApiComplianceSuite extends AnyWordSpec with DataFrameSuiteBase with LocalFileTestingUtilities { 24 | 25 | private def simpleDf = { 26 | val data = Seq(("foo", "bar", "1"), ("baz", "bang", "2")) 27 | spark.createDataFrame(data).toDF("col1", "col2", "col3") 28 | } 29 | 30 | /** Checks that the excel data files in given folder equal the provided dataframe */ 31 | private def assertWrittenExcelData(expectedDf: DataFrame, folder: String): Unit = { 32 | val actualDf = spark.read 33 | .format("excel") 34 | .option("path", folder) 35 | .load() 36 | 37 | /* assertDataFrameNoOrderEquals is sensitive to order of columns, so we 38 | order both dataframes in the same way 39 | */ 40 | val orderedSchemaColumns = expectedDf.schema.fields.map(f => f.name).sorted 41 | 42 | assertDataFrameNoOrderEquals( 43 | expectedDf.select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*), 44 | actualDf.select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*) 45 | ) 46 | 47 | } 48 | "excel v2 complies to DataFrameWriter SaveMode and Partitioning behavior" can { 49 | 50 | val writeModes = Seq(SaveMode.Overwrite, SaveMode.Append) 51 | 52 | for (writeMode <- writeModes) { 53 | s"write a dataframe to xlsx with ${writeMode.toString}" in withExistingCleanTempDir("v2") { targetDir => 54 | // create a df from csv then write as xlsx 55 | val df = simpleDf 56 | 57 | df.write 58 | .format("excel") 59 | .option("path", targetDir) 60 | .option("header", value = true) 61 | .mode(writeMode) 62 | .save() 63 | 64 | val listOfFiles = getListOfFilesFilteredByExtension(targetDir, "xlsx") 65 | assert(listOfFiles.nonEmpty, s"expected at least one excel file") 66 | 67 | // is the result really the same? 68 | assertWrittenExcelData(df, targetDir) 69 | 70 | } 71 | s"write a dataframe to xlsx with ${writeMode.toString} (partitioned)" in withExistingCleanTempDir("v2") { 72 | targetDir => 73 | assume(spark.sparkContext.version >= "3.0.1") 74 | // create a df from csv then write as xlsx 75 | val df = simpleDf 76 | 77 | df.write 78 | .partitionBy("col1") 79 | .format("excel") 80 | .option("path", targetDir) 81 | .option("header", value = true) 82 | .mode(writeMode) 83 | .save() 84 | 85 | // some file based checks 86 | val listOfFolders = getListOfFolders(targetDir) 87 | assert(listOfFolders.length == 2, s"expected two folders because there are two partitions") 88 | for (folder <- listOfFolders) { 89 | assert(folder.getName.startsWith("col1="), s"expected partition folders and those must start with col1=") 90 | val listOfFiles = getListOfFilesFilteredByExtension(folder.getAbsolutePath, "xlsx") 91 | assert(listOfFiles.nonEmpty, s"expected at least one xlsx per folder but got $listOfFiles") 92 | } 93 | 94 | // is the result really the same? 
95 | assertWrittenExcelData(df, targetDir) 96 | 97 | } 98 | } 99 | 100 | for (isPartitioned <- Seq(false, true)) { 101 | s"multiple appends to folder (partitioned == $isPartitioned)" in withExistingCleanTempDir("v2") { targetDir => 102 | if (isPartitioned) { 103 | assume(spark.sparkContext.version >= "3.0.1") 104 | } 105 | 106 | val df = simpleDf 107 | 108 | val dfWriter = if (isPartitioned) df.write.partitionBy("col1") else df.write // partition only in the partitioned scenario 109 | 110 | dfWriter 111 | .format("excel") 112 | .option("path", targetDir) 113 | .option("header", value = true) 114 | .mode(SaveMode.Append) 115 | .save() 116 | dfWriter 117 | .format("excel") 118 | .option("path", targetDir) 119 | .option("header", value = true) 120 | .mode(SaveMode.Append) 121 | .save() 122 | 123 | val orderedSchemaColumns = df.schema.fields.map(f => f.name).sorted 124 | val expectedDf = 125 | df.union(df).select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*) 126 | 127 | assertWrittenExcelData(expectedDf, targetDir) 128 | } 129 | } 130 | } 131 | } 132 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/EncryptedReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql._ 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import scala.jdk.CollectionConverters._ 25 | 26 | object EncryptedReadSuite { 27 | val simpleSchema = StructType( 28 | List( 29 | StructField("A", IntegerType, true), 30 | StructField("B", IntegerType, true), 31 | StructField("C", IntegerType, true), 32 | StructField("D", IntegerType, true) 33 | ) 34 | ) 35 | 36 | val expectedData = List(Row(1, 2, 3, 4)).asJava 37 | } 38 | 39 | class EncryptedReadSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 40 | import EncryptedReadSuite._ 41 | 42 | lazy val expected = spark.createDataFrame(expectedData, simpleSchema) 43 | 44 | test("read encrypted xlsx file") { 45 | val df = readFromResources( 46 | spark, 47 | path = "simple_encrypted.xlsx", 48 | options = Map( 49 | "dataAddress" -> "Sheet1!A1", 50 | "treatEmptyValuesAsNulls" -> true, 51 | "workbookPassword" -> "fooba", 52 | "inferSchema" -> true 53 | ) 54 | ) 55 | assertDataFrameEquals(expected, df) 56 | } 57 | 58 | test("read encrypted xlsx file (maxRowsInMemory)") { 59 | val df = readFromResources( 60 | spark, 61 | path = "simple_encrypted.xlsx", 62 | options = Map( 63 | "dataAddress" -> "Sheet1!A1", 64 | "treatEmptyValuesAsNulls" -> true, 65 | "workbookPassword" -> "fooba", 66 | "maxRowsInMemory" -> 1, 67 | "inferSchema" -> true 68 | ) 69 | ) 70 | assertDataFrameEquals(expected, df) 71 | } 72 | 73 | test("read encrypted xls file") { 74 | val df = readFromResources( 75 | spark, 76 | path = "simple_encrypted.xls", 77 | options = Map( 78 | "dataAddress" -> "Sheet1!A1", 79 | "treatEmptyValuesAsNulls" -> true, 80 | "workbookPassword" -> "fooba", 81 | "inferSchema" -> true 82 | ) 83 | ) 84 | assertDataFrameEquals(expected, df) 85 | } 86 | } 87 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/ErrorsAsStringsReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.sql.Timestamp 25 | import java.time.LocalDateTime 26 | import java.util 27 | import scala.jdk.CollectionConverters._ 28 | 29 | object ErrorsAsStringsReadSuite { 30 | private val dummyTimestamp = Timestamp.valueOf(LocalDateTime.of(2021, 2, 19, 0, 0)) 31 | private val dummyText = "hello" 32 | 33 | private val expectedSchemaInfer = StructType( 34 | List( 35 | StructField("double", IntegerType, true), 36 | StructField("boolean", BooleanType, true), 37 | StructField("timestamp", TimestampType, true), 38 | StructField("string", StringType, true), 39 | StructField("formula", StringType, true) 40 | ) 41 | ) 42 | 43 | private val expectedDataErrorsAsNullInfer: util.List[Row] = List( 44 | Row(1, true, dummyTimestamp, dummyText, "A1"), 45 | Row(2, false, dummyTimestamp, dummyText, "A3"), 46 | Row(null, null, null, null, null), 47 | Row(null, null, null, null, null) 48 | ).asJava 49 | 50 | private val expectedDataErrorsAsStringsInfer: util.List[Row] = List( 51 | Row(1, true, dummyTimestamp, dummyText, "A1"), 52 | Row(2, false, dummyTimestamp, dummyText, "A3"), 53 | Row(null, null, null, "#NULL!", "#DIV/0!"), 54 | Row(null, null, null, "#N/A", "#NAME?") 55 | ).asJava 56 | 57 | private val expectedSchemaNonInfer = StructType( 58 | List( 59 | StructField("double", StringType, true), 60 | StructField("boolean", StringType, true), 61 | StructField("timestamp", StringType, true), 62 | StructField("string", StringType, true), 63 | StructField("formula", StringType, true) 64 | ) 65 | ) 66 | 67 | private val expectedDataErrorsAsNullNonInfer: util.List[Row] = List( 68 | Row("1", "TRUE", """19"-"Feb"-"2021""", "hello", "A1"), 69 | Row("2", "FALSE", """19"-"Feb"-"2021""", "hello", "A3"), 70 | Row(null, null, null, null, null), 71 | Row(null, null, null, null, null) 72 | ).asJava 73 | 74 | private val expectedDataErrorsAsStringsNonInfer: util.List[Row] = List( 75 | Row("1", "TRUE", """19"-"Feb"-"2021""", dummyText, "A1"), 76 | Row("2", "FALSE", """19"-"Feb"-"2021""", dummyText, "A3"), 77 | Row("#NULL!", "#NULL!", "#NULL!", "#NULL!", "#DIV/0!"), 78 | Row("#N/A", "#N/A", "#N/A", "#N/A", "#NAME?") 79 | ).asJava 80 | } 81 | 82 | /** Breaking change with V1: For Spark String Type field, Error Cell has an option to either get error value or null as 83 | * any other Spark Types 84 | * 85 | * Related issues: Support ERROR cell type when using inferSchema=true link: 86 | * https://github.dev/mauch/spark-excel/pull/343 87 | */ 88 | class ErrorsAsStringsReadSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 89 | import ErrorsAsStringsReadSuite._ 90 | 91 | test("error cells as null when useNullForErrorCells=true and inferSchema=true") { 92 | val df = readFromResources( 93 | spark, 94 | path = "with_errors_all_types.xlsx", 95 | options = Map("inferSchema" -> true, "useNullForErrorCells" -> true) 96 | ) 97 | val expected = spark.createDataFrame(expectedDataErrorsAsNullInfer, expectedSchemaInfer) 98 | assertDataFrameEquals(expected, df) 99 | } 100 | 101 | test("errors as null for non-string type with useNullForErrorCells=false and inferSchema=true") { 102 | val df = readFromResources( 103 | spark, 104 | path = "with_errors_all_types.xlsx", 105 | options = Map("inferSchema" -> true, "useNullForErrorCells" -> false) 106 | ) 107 | val expected = 
spark.createDataFrame(expectedDataErrorsAsStringsInfer, expectedSchemaInfer) 108 | assertDataFrameEquals(expected, df) 109 | } 110 | 111 | test("errors in string format when useNullForErrorCells=true and inferSchema=false") { 112 | val df = readFromResources( 113 | spark, 114 | path = "with_errors_all_types.xlsx", 115 | options = Map("inferSchema" -> false, "useNullForErrorCells" -> true) 116 | ) 117 | val expected = spark.createDataFrame(expectedDataErrorsAsNullNonInfer, expectedSchemaNonInfer) 118 | assertDataFrameEquals(expected, df) 119 | } 120 | 121 | test("errors in string format when useNullForErrorCells=false and inferSchema=false") { 122 | val df = readFromResources( 123 | spark, 124 | path = "with_errors_all_types.xlsx", 125 | options = Map("inferSchema" -> false, "useNullForErrorCells" -> false) 126 | ) 127 | val expected = spark 128 | .createDataFrame(expectedDataErrorsAsStringsNonInfer, expectedSchemaNonInfer) 129 | assertDataFrameEquals(expected, df) 130 | } 131 | 132 | } 133 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/ExcelTestingUtilities.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import org.apache.spark.sql._ 20 | import org.apache.spark.sql.types.StructType 21 | import scala.reflect.io.Directory 22 | import java.io.File 23 | 24 | trait ExcelTestingUtilities { 25 | 26 | private val dataRoot = getClass.getResource("/spreadsheets").getPath 27 | 28 | /** Load excel data from resource folder 29 | * 30 | * @param spark 31 | * spark session 32 | * @param path 33 | * relative path to the resource/speadsheets 34 | * @param options 35 | * extra loading option 36 | * @return 37 | * data frame 38 | */ 39 | def readFromResources(spark: SparkSession, path: String, options: Map[String, Any]): DataFrame = 40 | spark.read 41 | .format("excel") 42 | .options(options.map(p => (p._1 -> p._2.toString()))) 43 | .load(s"$dataRoot/$path") 44 | 45 | /** Load excel data from resource folder with user defined schema 46 | * 47 | * @param spark 48 | * spark session 49 | * @param path 50 | * relative path to the resource/speadsheets 51 | * @param options 52 | * extra loading option 53 | * @param schema 54 | * user provided schema 55 | * @return 56 | * data frame 57 | */ 58 | def readFromResources(spark: SparkSession, path: String, options: Map[String, Any], schema: StructType): DataFrame = 59 | spark.read 60 | .format("excel") 61 | .options(options.map(p => (p._1 -> p._2.toString()))) 62 | .schema(schema) 63 | .load(s"$dataRoot/$path") 64 | 65 | /** Delete directory recursively. Intended for temporary testing data only. Use with causion! 
66 | * 67 | * @param path 68 | * to be deleted 69 | */ 70 | def deleteDirectory(path: String): Unit = { 71 | val directory = new Directory(new File(path)) 72 | directory.deleteRecursively() 73 | () 74 | } 75 | } 76 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/GlobPartitionAndFileNameSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.functions.input_file_name 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | /** Issue References: 25 | * 26 | * #52. input_file_name returns empty string https://github.dev/mauch/spark-excel/issues/52 27 | * 28 | * #74. Allow reading multiple files specified as a list OR by a pattern https://github.dev/mauch/spark-excel/issues/74 29 | * 30 | * #97. Reading multiple files https://github.dev/mauch/spark-excel/issues/97 31 | */ 32 | 33 | object GlobPartitionAndFileNameSuite { 34 | val expectedInferredSchema = StructType( 35 | List( 36 | StructField("Day", IntegerType, true), 37 | StructField("Month", IntegerType, true), 38 | StructField("Customer ID", StringType, true), 39 | StructField("Customer Name", StringType, true), 40 | StructField("Standard Package", IntegerType, true), 41 | StructField("Extra Option 1", IntegerType, true), 42 | StructField("Extra Option 2", IntegerType, true), 43 | StructField("Extra Option 3", IntegerType, true), 44 | StructField("Staff", StringType, true) 45 | ) 46 | ) 47 | 48 | val expectedWithFilenameSchema = StructType( 49 | List( 50 | StructField("Day", IntegerType, true), 51 | StructField("Month", IntegerType, true), 52 | StructField("Customer ID", StringType, true), 53 | StructField("Customer Name", StringType, true), 54 | StructField("Standard Package", IntegerType, true), 55 | StructField("Extra Option 1", IntegerType, true), 56 | StructField("Extra Option 2", IntegerType, true), 57 | StructField("Extra Option 3", IntegerType, true), 58 | StructField("Staff", StringType, true), 59 | StructField("file_name", StringType, false) 60 | ) 61 | ) 62 | 63 | val expectedWithPartitionSchema = StructType( 64 | List( 65 | StructField("Day", IntegerType, true), 66 | StructField("Month", IntegerType, true), 67 | StructField("Customer ID", StringType, true), 68 | StructField("Customer Name", StringType, true), 69 | StructField("Standard Package", IntegerType, true), 70 | StructField("Extra Option 1", IntegerType, true), 71 | StructField("Extra Option 2", IntegerType, true), 72 | StructField("Extra Option 3", IntegerType, true), 73 | StructField("Staff", StringType, true), 74 | StructField("Quarter", IntegerType, true) 75 | ) 76 | ) 77 | } 78 | 79 | class GlobPartitionAndFileNameSuite extends AnyFunSuite 
with DataFrameSuiteBase with ExcelTestingUtilities { 80 | import GlobPartitionAndFileNameSuite._ 81 | 82 | private val sharedOptions = Map("header" -> true, "inferSchema" -> true) 83 | 84 | test("read multiple files must infer correct schema with inferSchema=true") { 85 | val df = readFromResources(spark, "ca_dataset/2019/Quarter=4/*.xlsx", sharedOptions) 86 | assert(df.schema == expectedInferredSchema) 87 | } 88 | 89 | test("read multiple files with input_file_name") { 90 | val df = readFromResources(spark, "ca_dataset/2019/Quarter=4/*.xlsx", sharedOptions) 91 | .withColumn("file_name", input_file_name()) 92 | assert(df.schema == expectedWithFilenameSchema) 93 | 94 | /* And validate list of filename */ 95 | val names = df 96 | .select("file_name") 97 | .distinct() 98 | .collect() 99 | .map(r => r.getString(0)) 100 | .map(p => p.split("[\\/]").last) // this works on Windows too 101 | .toSet 102 | assert(names == Set[String]("ca_10.xlsx", "ca_11.xlsx", "ca_12.xlsx")) 103 | } 104 | 105 | test("read whole folder with partition") { 106 | val df = readFromResources(spark, "ca_dataset/2019", sharedOptions) 107 | assert(df.schema == expectedWithPartitionSchema) 108 | 109 | /* And validate list of Quarters */ 110 | val quarters = df.select("Quarter").distinct().collect().map(r => r.getInt(0)).toSet 111 | assert(quarters == Set[Int](1, 2, 3, 4)) 112 | } 113 | 114 | test("read multiple files must has same number total number of rows") { 115 | val q4_total = readFromResources(spark, "ca_dataset/2019/Quarter=4/*.xlsx", sharedOptions) 116 | .count() 117 | 118 | val q4_sum = Seq("ca_10.xlsx", "ca_11.xlsx", "ca_12.xlsx") 119 | .map(name => readFromResources(spark, s"ca_dataset/2019/Quarter=4/$name", sharedOptions).count()) 120 | .sum 121 | 122 | assert(q4_total > 0) 123 | assert(q4_total == q4_sum) 124 | } 125 | 126 | } 127 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/InferStricterNumericalTypesSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.util 25 | import scala.jdk.CollectionConverters._ 26 | 27 | object InferStricterNumericalTypesSuite { 28 | val expectedInferredSchema = StructType( 29 | List( 30 | StructField("ID", StringType, true), 31 | StructField("Integer Value Range", IntegerType, true), 32 | StructField("Long Value Range", LongType, true), 33 | StructField("Double Value Range", DoubleType, true) 34 | ) 35 | ) 36 | 37 | /** Stricter types for numerical values 38 | */ 39 | val expectedDataInferSchema: util.List[Row] = List( 40 | Row("Gas & Oil", 2147482967, 92147483647L, 90315085.71d), 41 | Row("Telecomunication", 2147483099, 102147483647L, -965079398.74d), 42 | Row("Manufacturing", 2147482826, 112147483647L, -353020871.56d), 43 | Row("Farming", 2147482838, -102147483647L, -446026564.15d), 44 | Row("Service", 2147483356, -112147483647L, -820766945.73d) 45 | ).asJava 46 | } 47 | 48 | class InferStricterNumericalTypesSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 49 | import InferStricterNumericalTypesSuite._ 50 | 51 | test("stricter numerical types usePlainNumberFormat=true and inferSchema=true (xlsx)") { 52 | val df = readFromResources( 53 | spark, 54 | path = "infer_stricter_numerical_types.xlsx", 55 | options = Map("usePlainNumberFormat" -> true, "inferSchema" -> true) 56 | ) 57 | val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema) 58 | assertDataFrameEquals(expected, df) 59 | } 60 | 61 | test("stricter numerical types usePlainNumberFormat=false and inferSchema=true (xlsx)") { 62 | val df = readFromResources( 63 | spark, 64 | path = "infer_stricter_numerical_types.xlsx", 65 | options = Map("usePlainNumberFormat" -> false, "inferSchema" -> true) 66 | ) 67 | val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema) 68 | assertDataFrameEquals(expected, df) 69 | } 70 | 71 | test("stricter numerical types usePlainNumberFormat=true and inferSchema=true (xls)") { 72 | val df = readFromResources( 73 | spark, 74 | path = "infer_stricter_numerical_types.xls", 75 | options = Map("usePlainNumberFormat" -> true, "inferSchema" -> true) 76 | ) 77 | val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema) 78 | assertDataFrameEquals(expected, df) 79 | } 80 | 81 | test("stricter numerical types usePlainNumberFormat=false and inferSchema=true (xls)") { 82 | val df = readFromResources( 83 | spark, 84 | path = "infer_stricter_numerical_types.xls", 85 | options = Map("usePlainNumberFormat" -> false, "inferSchema" -> true) 86 | ) 87 | val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema) 88 | assertDataFrameEquals(expected, df) 89 | } 90 | } 91 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/KeepUndefinedRowsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 
6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.util 25 | import scala.jdk.CollectionConverters._ 26 | 27 | object KeepUndefinedRowsSuite { 28 | 29 | /* Issue: https://github.dev/mauch/spark-excel/issues/285 */ 30 | val expectedSchema_Issue285 = StructType( 31 | List(StructField("1", StringType, true), StructField("2", StringType, true), StructField("3", StringType, true)) 32 | ) 33 | 34 | /** No change to the spark-excel, Apache POI also produce same result with sheet.iterator 35 | * 36 | * Workaround: https://stackoverflow.com/questions/47790569/how-to-avoid-skipping-blank-rows-or-columns-in-apache-poi 37 | * Doc: http://poi.apache.org/components/spreadsheet/quick-guide.html#Iterator 38 | */ 39 | val expectedData_Issue285: util.List[Row] = List( 40 | Row("File info", null, null), 41 | Row("Info", "Info", "Info"), 42 | Row("Metadata", null, null), 43 | Row(null, "1", "2"), 44 | Row("A", "1", "2"), 45 | Row("B", "5", "6"), 46 | Row("C", "9", "10"), 47 | Row("Metadata", null, null), 48 | Row(null, "1", "2"), 49 | Row("A", "1", "2"), 50 | Row("B", "4", "5"), 51 | Row("C", "7", "8") 52 | ).asJava 53 | 54 | /* With newly introduced keepUndefinedRows option */ 55 | val expectedData_KeepUndefinedRows_Issue285: util.List[Row] = List( 56 | Row("File info", null, null), 57 | Row("Info", "Info", "Info"), 58 | Row(null, null, null), 59 | Row("Metadata", null, null), 60 | Row(null, null, null), 61 | Row(null, "1", "2"), 62 | Row("A", "1", "2"), 63 | Row("B", "5", "6"), 64 | Row("C", "9", "10"), 65 | Row(null, null, null), 66 | Row(null, null, null), 67 | Row("Metadata", null, null), 68 | Row(null, null, null), 69 | Row(null, "1", "2"), 70 | Row("A", "1", "2"), 71 | Row("B", "4", "5"), 72 | Row("C", "7", "8") 73 | ).asJava 74 | 75 | /** Issue: https://github.dev/mauch/spark-excel/issues/162 Spark-excel still infers to Double-Type, however, user can 76 | * provide custom scheme and Spark-excel should load to IntegerType or LongType accordingly 77 | */ 78 | val userDefined_Issue162 = StructType( 79 | List( 80 | StructField("ID", IntegerType, true), 81 | StructField("address", StringType, true), 82 | StructField("Pin", IntegerType, true) 83 | ) 84 | ) 85 | 86 | val expectedData_Issue162: util.List[Row] = 87 | List(Row(123123, "Asdadsas, Xyxyxy, 123xyz", 123132), Row(123124, "Asdadsas1, Xyxyxy, 123xyz", 123133)).asJava 88 | 89 | } 90 | 91 | class KeepUndefinedRowsSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 92 | import KeepUndefinedRowsSuite._ 93 | 94 | test("#285 undefined rows: no keep") { 95 | val df = readFromResources( 96 | spark, 97 | path = "issue_285_bryce21.xlsx", 98 | options = Map("header" -> false, "inferSchema" -> false, "keepUndefinedRows" -> false), 99 | schema = expectedSchema_Issue285 100 | ) 101 | val expected = spark.createDataFrame(expectedData_Issue285, expectedSchema_Issue285) 102 | assertDataFrameEquals(expected, 
df) 103 | } 104 | 105 | test("#162 load integer values with user defined schema") { 106 | val df = readFromResources( 107 | spark, 108 | path = "issue_162_nihar_gharat.xlsx", 109 | options = Map("header" -> true), 110 | schema = userDefined_Issue162 111 | ) 112 | val expected = spark.createDataFrame(expectedData_Issue162, userDefined_Issue162) 113 | assertDataFrameEquals(expected, df) 114 | } 115 | 116 | for (sheetName <- Seq("blank_row", "space_row")) { 117 | test(s"#965 handling of NULL/BLANK column values (streamingReader, keepUndefinedRows==false, sheet=$sheetName)") { 118 | val df = readFromResources( 119 | spark, 120 | path = "issue_965_blank_rows.xlsx", 121 | options = Map( 122 | "dataAddress" -> s"'${sheetName}'!A1", 123 | "inferSchema" -> true, 124 | "header" -> true, 125 | "maxRowsInMemory" -> "1000", 126 | "keepUndefinedRows" -> false 127 | ) 128 | ) 129 | assert(df.schema.fields.length == 5) // sheet 001 has 5 columns 130 | /* 131 | sheet "blank_row" has row 2 and 4 defined, while row 3 is not defined in excel xml and row 5 contains empty cells in excel xml 132 | => 2 rows in total (prior the fix row 5 was added as well) 133 | sheet "space_row" has row 2 and 4 defined with some values, row 3 contains just a whitespace in A3 134 | => 3 rows in total (just to test that a single whitespace is handled correctly) 135 | */ 136 | if (sheetName == "blank_row") { 137 | assert(df.count() == 2) 138 | } else { 139 | assert(df.count() == 3) 140 | } 141 | } 142 | } 143 | 144 | for (keepUndefinedRows <- Seq(false, true)) { 145 | test(s"#965 handling of NULL/BLANK column values (NON-streaming-Reader, keepUndefinedRows==$keepUndefinedRows)") { 146 | val df = readFromResources( 147 | spark, 148 | path = "issue_965_blank_rows.xlsx", 149 | options = Map( 150 | "dataAddress" -> s"'blank_row'!A1", 151 | "inferSchema" -> true, 152 | "header" -> true, 153 | "keepUndefinedRows" -> keepUndefinedRows 154 | ) 155 | ) 156 | assert(df.schema.fields.length == 5) // sheet 001 has 5 columns 157 | /* 158 | sheet "blank_row" has row 2 and 4 defined, while row 3 is not defined in excel xml and row 5 contains empty cells in excel xml 159 | * keepUndefinedRows == true => 4 rows in total 160 | * keepUndefinedRows == false => 2 rows in total 161 | */ 162 | if (keepUndefinedRows) { 163 | assert(df.count() == 4) 164 | } else { 165 | assert(df.count() == 2) 166 | } 167 | } 168 | } 169 | 170 | } 171 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/LocalFileTestingUtilities.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import java.io.File 20 | import java.nio.file.Files 21 | 22 | trait LocalFileTestingUtilities { 23 | 24 | /** Returns the list of files in given directory/folder (this is not recursive) 25 | */ 26 | def getListOfFiles(folder: String): List[File] = { 27 | val d = new File(folder) 28 | if (d.exists && d.isDirectory) { 29 | d.listFiles.filter(_.isFile).toList 30 | } else { 31 | List[File]() 32 | } 33 | } 34 | 35 | /** similar to getListOfFiles but filters the files by the given file extension */ 36 | def getListOfFilesFilteredByExtension(targetDir: String, filteredByExtension: String): Seq[String] = { 37 | val filesInTargetDir = getListOfFiles(targetDir) 38 | filesInTargetDir.filter(_.getName.endsWith(filteredByExtension)).map(_.getName) 39 | } 40 | 41 | /** Returns the list of folders in given directory/folder (this is not recursive */ 42 | def getListOfFolders(folder: String): List[File] = { 43 | val d = new File(folder) 44 | if (d.exists && d.isDirectory) { 45 | d.listFiles.filter(_.isDirectory).toList 46 | } else { 47 | List[File]() 48 | } 49 | } 50 | 51 | /** Deletes the (non-empty) directory (recursively) 52 | */ 53 | def deleteDirectoryRecursively(folderToDelete: File): Unit = { 54 | val allContents = folderToDelete.listFiles 55 | if (allContents != null) for (file <- allContents) { 56 | deleteDirectoryRecursively(file) 57 | } 58 | folderToDelete.delete 59 | () 60 | } 61 | 62 | /** fixture that creates a temporary folder and deletes it after test completion */ 63 | def withExistingCleanTempDir(name: String): (String => Unit) => Unit = { 64 | 65 | def fixture(testCode: String => Unit): Unit = { 66 | 67 | val directory = Files.createTempDirectory(name) 68 | 69 | try testCode(directory.toString) 70 | finally deleteDirectoryRecursively(directory.toFile) 71 | } 72 | 73 | fixture 74 | } 75 | 76 | } 77 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/ManyPartitionReadSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql._ 21 | import org.apache.spark.sql.functions.col 22 | import org.apache.spark.sql.types.IntegerType 23 | import org.scalatest.wordspec.AnyWordSpec 24 | import org.apache.spark.sql.types.{StringType, StructField, StructType} 25 | 26 | class ManyPartitionReadSuite extends AnyWordSpec with DataFrameSuiteBase with LocalFileTestingUtilities { 27 | 28 | /** Checks that the excel data files in given folder equal the provided dataframe */ 29 | private def assertWrittenExcelData(expectedDf: DataFrame, folder: String): Unit = { 30 | val actualDf = spark.read 31 | .format("excel") 32 | .option("path", folder) 33 | .load() 34 | 35 | /* assertDataFrameNoOrderEquals is sensitive to order of columns, so we 36 | order both dataframes in the same way 37 | */ 38 | val orderedSchemaColumns = expectedDf.schema.fields.map(f => f.name).sorted 39 | 40 | assertDataFrameNoOrderEquals( 41 | expectedDf.select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*), 42 | actualDf.select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*) 43 | ) 44 | 45 | } 46 | 47 | def createExpected(targetDir: String): DataFrame = { 48 | 49 | // Generate data programmatically 50 | val data = (1 to 19).flatMap { col1 => 51 | // Each col1 value has multiple rows (around 10-11 rows each) 52 | val rowsPerPartition = if (col1 == 1) 8 else if (col1 == 2) 16 else 11 53 | (0 until rowsPerPartition).map { i => 54 | val index = (col1 - 1) * 11 + i + 1234 // Starting from 1234 as in original data 55 | Row( 56 | Integer.valueOf(col1), // Make it nullable Integer 57 | s"fubar_$index", 58 | s"bazbang_${index + 77000}", 59 | s"barfang_${index + 237708}", 60 | s"lorem_ipsum_$index" 61 | ) 62 | } 63 | } 64 | 65 | // Define schema explicitly to match expected nullability 66 | val schema = StructType( 67 | Array( 68 | StructField("col1", IntegerType, nullable = true), 69 | StructField("col2", StringType, nullable = true), 70 | StructField("col3", StringType, nullable = true), 71 | StructField("col4", StringType, nullable = true), 72 | StructField("col5", StringType, nullable = true) 73 | ) 74 | ) 75 | 76 | val dfInput = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) 77 | 78 | val dfFinal = dfInput.union(dfInput) 79 | 80 | val dfWriter = dfFinal.write 81 | .partitionBy("col1") 82 | .format("excel") 83 | .option("path", targetDir) 84 | .option("header", value = true) 85 | .mode(SaveMode.Append) 86 | 87 | dfWriter.save() 88 | dfWriter.save() 89 | 90 | val orderedSchemaColumns = dfInput.schema.fields.map(f => f.name).sorted 91 | 92 | dfFinal 93 | .union(dfFinal) 94 | .withColumn("col1", col("col1").cast(IntegerType)) 95 | .select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*) 96 | 97 | } 98 | 99 | for (run <- Range(0, 3)) { 100 | 101 | s"many partitions read (run=$run)" in withExistingCleanTempDir("v2") { targetDir => 102 | assume(spark.sparkContext.version >= "3.0.1") 103 | val expectedDf = createExpected(targetDir) 104 | assertWrittenExcelData(expectedDf, targetDir) 105 | } 106 | } 107 | 108 | } 109 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/NumericTypesSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the 
"License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.util 25 | import scala.jdk.CollectionConverters._ 26 | 27 | /** For schema infering as well as loading for various numeric types {Integer, Long, Double} 28 | */ 29 | object NumericTypesSuite { 30 | 31 | val userDefinedSchema_01 = StructType( 32 | List( 33 | StructField("Day", IntegerType, true), 34 | StructField("Month", IntegerType, true), 35 | StructField("Customer ID", StringType, true), 36 | StructField("Customer Name", StringType, true), 37 | StructField("Standard Package", IntegerType, true), 38 | StructField("Extra Option 1", IntegerType, true), 39 | StructField("Extra Option 2", IntegerType, true), 40 | StructField("Extra Option 3", IntegerType, true), 41 | StructField("Staff", StringType, true) 42 | ) 43 | ) 44 | 45 | val expectedData_01: util.List[Row] = List( 46 | Row(1, 12, "CA869", "Phạm Uyển Trinh", null, null, 2200, null, "Ella Fitzgerald"), 47 | Row(1, 12, "CA870", "Nguyễn Liên Thảo", null, null, 2000, 1350, "Ella Fitzgerald"), 48 | Row(1, 12, "CA871", "Lê Thị Nga", 17000, null, null, null, "Ella Fitzgerald"), 49 | Row(1, 12, "CA872", "Phan Tố Nga", null, null, 2000, null, "Teresa Teng"), 50 | Row(1, 12, "CA873", "Nguyễn Thị Teresa Teng", null, null, 1200, null, "Jesse Thomas") 51 | ).asJava 52 | 53 | val userDefinedSchema_02 = StructType( 54 | List( 55 | StructField("Day", LongType, true), 56 | StructField("Month", LongType, true), 57 | StructField("Customer ID", StringType, true), 58 | StructField("Customer Name", StringType, true), 59 | StructField("Standard Package", IntegerType, true), 60 | StructField("Extra Option 1", IntegerType, true), 61 | StructField("Extra Option 2", IntegerType, true), 62 | StructField("Extra Option 3", LongType, true), 63 | StructField("Staff", StringType, true) 64 | ) 65 | ) 66 | 67 | val expectedData_02: util.List[Row] = List( 68 | Row(1L, 12L, "CA869", "Phạm Uyển Trinh", null, null, 2200, null, "Ella Fitzgerald"), 69 | Row(1L, 12L, "CA870", "Nguyễn Liên Thảo", null, null, 2000, 1350L, "Ella Fitzgerald"), 70 | Row(1L, 12L, "CA871", "Lê Thị Nga", 17000, null, null, null, "Ella Fitzgerald"), 71 | Row(1L, 12L, "CA872", "Phan Tố Nga", null, null, 2000, null, "Teresa Teng"), 72 | Row(1L, 12L, "CA873", "Nguyễn Thị Teresa Teng", null, null, 1200, null, "Jesse Thomas") 73 | ).asJava 74 | } 75 | 76 | class NumericTypesSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 77 | import NumericTypesSuite._ 78 | 79 | test("load with user defined schema with Integer types") { 80 | val df = readFromResources( 81 | spark, 82 | path = "ca_dataset/2019/Quarter=4/ca_12.xlsx", 83 | options = Map("header" -> true), 84 | schema = userDefinedSchema_01 85 | ).limit(5) 86 | val expected = spark.createDataFrame(expectedData_01, userDefinedSchema_01) 87 | 88 | 
assertDataFrameEquals(expected, df) 89 | } 90 | 91 | test("load with user defined schema with both Integer and Long types") { 92 | val df = readFromResources( 93 | spark, 94 | path = "ca_dataset/2019/Quarter=4/ca_12.xlsx", 95 | options = Map("header" -> true), 96 | schema = userDefinedSchema_02 97 | ).limit(5) 98 | val expected = spark.createDataFrame(expectedData_02, userDefinedSchema_02) 99 | 100 | assertDataFrameEquals(expected, df) 101 | } 102 | } 103 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/RowNumberColumnSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.util 25 | import scala.jdk.CollectionConverters._ 26 | 27 | /** Related issues: #40 Allow reading only a subset of rows https://github.dev/mauch/spark-excel/issues/40 #59 Rows are 28 | * returned in incorrect order on cluster https://github.dev/mauch/spark-excel/issues/59 #115 Add excel row number 29 | * column https://github.dev/mauch/spark-excel/issues/115 30 | */ 31 | object RowNumberColumnSuite { 32 | 33 | val expectedSchema = StructType( 34 | List( 35 | StructField("RowID", IntegerType, true), 36 | StructField("1", StringType, true), 37 | StructField("2", StringType, true), 38 | StructField("3", StringType, true) 39 | ) 40 | ) 41 | 42 | val expectedData_NoKeep: util.List[Row] = List( 43 | Row(0, "File info", null, null), 44 | Row(1, "Info", "Info", "Info"), 45 | Row(3, "Metadata", null, null), 46 | Row(5, null, "1", "2"), 47 | Row(6, "A", "1", "2"), 48 | Row(7, "B", "5", "6"), 49 | Row(8, "C", "9", "10"), 50 | Row(11, "Metadata", null, null), 51 | Row(13, null, "1", "2"), 52 | Row(14, "A", "1", "2"), 53 | Row(15, "B", "4", "5"), 54 | Row(16, "C", "7", "8") 55 | ).asJava 56 | 57 | val expectedData_Keep: util.List[Row] = List( 58 | Row(0, "File info", null, null), 59 | Row(1, "Info", "Info", "Info"), 60 | Row(null, null, null, null), 61 | Row(3, "Metadata", null, null), 62 | Row(null, null, null, null), 63 | Row(5, null, "1", "2"), 64 | Row(6, "A", "1", "2"), 65 | Row(7, "B", "5", "6"), 66 | Row(8, "C", "9", "10"), 67 | Row(null, null, null, null), 68 | Row(null, null, null, null), 69 | Row(11, "Metadata", null, null), 70 | Row(null, null, null, null), 71 | Row(13, null, "1", "2"), 72 | Row(14, "A", "1", "2"), 73 | Row(15, "B", "4", "5"), 74 | Row(16, "C", "7", "8") 75 | ).asJava 76 | 77 | val expectedSchema_Projection = StructType( 78 | List( 79 | StructField("3", StringType, true), 80 | StructField("RowID", IntegerType, true), 81 | StructField("2", StringType, true) 82 | ) 83 | ) 84 | 85 | val expectedData_Projection: 
util.List[Row] = List( 86 | Row(null, 0, null), 87 | Row("Info", 1, "Info"), 88 | Row(null, 3, null), 89 | Row("2", 5, "1"), 90 | Row("2", 6, "1"), 91 | Row("6", 7, "5"), 92 | Row("10", 8, "9"), 93 | Row(null, 11, null), 94 | Row("2", 13, "1"), 95 | Row("2", 14, "1"), 96 | Row("5", 15, "4"), 97 | Row("8", 16, "7") 98 | ).asJava 99 | 100 | } 101 | 102 | class RowNumberColumnSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 103 | import RowNumberColumnSuite._ 104 | 105 | test("read with additional excel row number column") { 106 | val df = readFromResources( 107 | spark, 108 | path = "issue_285_bryce21.xlsx", 109 | Map("header" -> false, "keepUndefinedRows" -> false, "columnNameOfRowNumber" -> "RowID"), 110 | schema = expectedSchema 111 | ) 112 | val expected = spark.createDataFrame(expectedData_NoKeep, expectedSchema) 113 | assertDataFrameEquals(expected, df) 114 | } 115 | 116 | test("read with additional excel row number column, keep undefined rows") { 117 | val df = readFromResources( 118 | spark, 119 | path = "/issue_285_bryce21.xlsx", 120 | Map("header" -> false, "keepUndefinedRows" -> true, "columnNameOfRowNumber" -> "RowID"), 121 | schema = expectedSchema 122 | ) 123 | val expected = spark.createDataFrame(expectedData_Keep, expectedSchema) 124 | assertDataFrameEquals(expected, df) 125 | } 126 | 127 | test("read with additional excel row number column, projection") { 128 | val df = readFromResources( 129 | spark, 130 | path = "/issue_285_bryce21.xlsx", 131 | Map("header" -> false, "keepUndefinedRows" -> false, "columnNameOfRowNumber" -> "RowID"), 132 | schema = expectedSchema 133 | ).select("3", "RowID", "2") 134 | val expected = spark.createDataFrame(expectedData_Projection, expectedSchema_Projection) 135 | assertDataFrameEquals(expected, df) 136 | } 137 | } 138 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/spark/excel/v2/UserReportedIssuesSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package dev.mauch.spark.excel.v2 18 | 19 | import dev.mauch.spark.DataFrameSuiteBase 20 | import org.apache.spark.sql.Row 21 | import org.apache.spark.sql.types._ 22 | import org.scalatest.funsuite.AnyFunSuite 23 | 24 | import java.util 25 | import scala.jdk.CollectionConverters._ 26 | import java.sql.Date 27 | 28 | object UserReportedIssuesSuite { 29 | 30 | /** Issue: https://github.dev/mauch/spark-excel/issues/463 Cannot load Date and Decimal fields 31 | */ 32 | val userDefined_Issue463 = StructType( 33 | List( 34 | StructField("itm no", StringType, true), 35 | StructField("Expense", DecimalType(23, 10), true), 36 | StructField("Date", DateType, true) 37 | ) 38 | ) 39 | 40 | val expectedData_Issue463: util.List[Row] = 41 | List(Row("item1", Decimal("1.1"), Date.valueOf("2021-10-01"))).asJava 42 | 43 | } 44 | 45 | class UserReportedIssuesSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities { 46 | import UserReportedIssuesSuite._ 47 | 48 | test("#463 Date and decimal with user defined schema") { 49 | val df = readFromResources( 50 | spark, 51 | path = "issue_463_cristichircu.xlsx", 52 | options = Map("header" -> true), 53 | schema = userDefined_Issue463 54 | ) 55 | val expected = spark.createDataFrame(expectedData_Issue463, userDefined_Issue463) 56 | assertDataFrameEquals(expected, df) 57 | } 58 | } 59 | -------------------------------------------------------------------------------- /src/test/scala/dev/mauch/tags/package.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2022 Martin Mauch (@nightscape) 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package dev.mauch 18 | 19 | import org.scalatest.Tag 20 | 21 | package object tags { 22 | object WIP extends Tag("dev.mauch.tags.WIP") 23 | } 24 | --------------------------------------------------------------------------------
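The V2 suites above all drive spark-excel through the standard DataFrameReader/DataFrameWriter API with format("excel"). For orientation, here is a minimal end-to-end sketch in the same style. It is not a file from this repository: it assumes a local SparkSession with the spark-excel artifact on the classpath, and the input path, output path, sheet address, and partition column name are placeholders; the option names are the ones exercised by the tests (dataAddress, header, inferSchema, maxRowsInMemory).

import org.apache.spark.sql.{SaveMode, SparkSession}

object SparkExcelUsageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("spark-excel usage sketch").getOrCreate()

    // Read one sheet of a workbook; path and sheet name are placeholders.
    val df = spark.read
      .format("excel")
      .option("dataAddress", "'Sheet1'!A1") // cell where the table starts
      .option("header", "true")             // first row holds column names
      .option("inferSchema", "true")        // infer Integer/Long/Double/Timestamp column types
      .option("maxRowsInMemory", "1000")    // stream rows instead of materializing the whole workbook
      .load("/tmp/spark-excel-demo/input/report.xlsx")

    // Write the result back out as xlsx, partitioned by a column (placeholder name).
    df.write
      .format("excel")
      .partitionBy("Quarter")
      .option("header", "true")
      .mode(SaveMode.Append)
      .save("/tmp/spark-excel-demo/output/report_by_quarter")

    spark.stop()
  }
}

As in DataFrameWriterApiComplianceSuite, the write target is a directory: each partition value becomes a col=value subfolder containing one or more xlsx part files, and reading that directory back reconstructs the partition column.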