├── .git-blame-ignore-revs
├── .github
├── ISSUE_TEMPLATE
│ ├── config.yml
│ └── generic.yml
├── dependabot.yml
└── workflows
│ ├── changelog.yaml
│ ├── ci.yml
│ ├── clean.yml
│ ├── dependency-graph.yml
│ ├── potential-duplicates.yml
│ ├── pr-agent.yaml
│ └── rebase.yml
├── .gitignore
├── .mill-jvm-opts
├── .mill-version
├── .scala-steward.conf
├── .scalafmt.conf
├── CHANGELOG.md
├── CONTRIBUTING.md
├── ISSUE_TEMPLATE.md
├── LICENSE
├── README.md
├── build.mill
├── docs
├── README.md
├── azure_synapse.md
└── spark_excel_examples.ipynb
├── mill
├── private-key.pem.enc
├── scalastyle-config.xml
└── src
├── README.md
├── main
├── 2.4
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ ├── ExcelSparkInternal.scala
│ │ └── excel
│ │ └── v2
│ │ ├── ExcelDataSource.scala
│ │ ├── ExcelDateTimeStringUtils.scala
│ │ ├── ExcelFilters.scala
│ │ ├── ExcelOptions.scala
│ │ ├── ExcelParserBase.scala
│ │ ├── FailureSafeParser.scala
│ │ └── SchemaUtils.scala
├── 3.0
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ └── excel
│ │ └── v2
│ │ ├── ExcelDateTimeStringUtils.scala
│ │ └── ExcelFilters.scala
├── 3.0_and_up
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ └── excel
│ │ └── v2
│ │ ├── ExcelDataSource.scala
│ │ └── ExcelFileFormat.scala
├── 3.0_to_3.1
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ └── excel
│ │ └── v2
│ │ ├── ExcelOutputWriter.scala
│ │ ├── ExcelTable.scala
│ │ └── ExcelWriteBuilder.scala
├── 3.0_to_3.2
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ └── excel
│ │ └── v2
│ │ ├── ExcelScan.scala
│ │ └── ExcelScanBuilder.scala
├── 3.0_to_3.3
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ └── excel
│ │ └── v2
│ │ └── ExcelOptions.scala
├── 3.0_to_3.4.1
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ └── excel
│ │ └── v2
│ │ ├── ExcelParserBase.scala
│ │ └── ExcelPartitionReaderFactory.scala
├── 3.1
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ └── excel
│ │ └── v2
│ │ └── ExcelDateTimeStringUtils.scala
├── 3.1_and_up
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ └── excel
│ │ └── v2
│ │ └── ExcelFilters.scala
├── 3.2_and_up
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ └── excel
│ │ └── v2
│ │ ├── ExcelDateTimeStringUtils.scala
│ │ ├── ExcelOutputWriter.scala
│ │ ├── ExcelTable.scala
│ │ └── ExcelWriteBuilder.scala
├── 3.3_and_up
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ └── excel
│ │ └── v2
│ │ ├── ExcelScan.scala
│ │ └── ExcelScanBuilder.scala
├── 3.4.2_and_up
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ └── excel
│ │ └── v2
│ │ ├── ExcelParserBase.scala
│ │ └── ExcelPartitionReaderFactory.scala
├── 3.4_and_up
│ └── scala
│ │ └── dev
│ │ └── mauch
│ │ └── spark
│ │ └── excel
│ │ └── v2
│ │ └── ExcelOptions.scala
├── resources
│ └── META-INF
│ │ └── services
│ │ └── org.apache.spark.sql.sources.DataSourceRegister
└── scala
│ └── dev
│ └── mauch
│ └── spark
│ └── excel
│ ├── DataColumn.scala
│ ├── DataLocator.scala
│ ├── DefaultSource.scala
│ ├── DefaultSource15.scala
│ ├── ExcelFileSaver.scala
│ ├── ExcelRelation.scala
│ ├── InferSchema.scala
│ ├── PlainNumberFormat.scala
│ ├── Utils.scala
│ ├── WorkbookReader.scala
│ ├── package.scala
│ └── v2
│ ├── DataLocator.scala
│ ├── ExcelGenerator.scala
│ ├── ExcelHeaderChecker.scala
│ ├── ExcelHelper.scala
│ ├── ExcelInferSchema.scala
│ ├── ExcelOptionsTrait.scala
│ ├── ExcelParser.scala
│ └── SheetData.scala
└── test
├── resources
├── log4j2.properties
└── spreadsheets
│ ├── Issue_747_plain_number.xlsx
│ ├── apache_poi
│ ├── 57231_MixedGasReport.xls
│ └── DataTableCities.xlsx
│ ├── ca_dataset
│ └── 2019
│ │ ├── Quarter=1
│ │ └── ca_03.xlsx
│ │ ├── Quarter=2
│ │ ├── ca_04.xlsx
│ │ ├── ca_05.xlsx
│ │ └── ca_06.xlsx
│ │ ├── Quarter=3
│ │ ├── ca_07.xlsx
│ │ ├── ca_08.xlsx
│ │ └── ca_09.xlsx
│ │ └── Quarter=4
│ │ ├── ca_10.xlsx
│ │ ├── ca_11.xlsx
│ │ └── ca_12.xlsx
│ ├── infer_stricter_numerical_types.xls
│ ├── infer_stricter_numerical_types.xlsx
│ ├── issue_162_nihar_gharat.xlsx
│ ├── issue_285_bryce21.xlsx
│ ├── issue_463_cristichircu.xlsx
│ ├── issue_942_sheetname_digits.xlsx
│ ├── issue_944_faulty_dimension.md
│ ├── issue_944_faulty_dimension.xlsx
│ ├── issue_965_blank_rows.md
│ ├── issue_965_blank_rows.xlsx
│ ├── plain_number.xlsx
│ ├── read_multiple_sheets_at_once.xlsx
│ ├── read_multiple_sheets_at_once_noheader.xlsx
│ ├── simple_encrypted.xls
│ ├── simple_encrypted.xlsx
│ └── with_errors_all_types.xlsx
└── scala
└── dev
└── mauch
├── spark
├── DataFrameSuiteBase.scala
└── excel
│ ├── DataLocatorSuite.scala
│ ├── EncryptedReadSuite.scala
│ ├── ErrorsAsStringsReadSuite.scala
│ ├── Generators.scala
│ ├── IntegrationSuite.scala
│ ├── PlainNumberReadSuite.scala
│ ├── RichRowSuite.scala
│ └── v2
│ ├── AreaReferenceReadSuite.scala
│ ├── DataFrameWriterApiComplianceSuite.scala
│ ├── EncryptedReadSuite.scala
│ ├── ErrorsAsStringsReadSuite.scala
│ ├── ExcelTestingUtilities.scala
│ ├── GlobPartitionAndFileNameSuite.scala
│ ├── InferStricterNumericalTypesSuite.scala
│ ├── KeepUndefinedRowsSuite.scala
│ ├── LocalFileTestingUtilities.scala
│ ├── ManyPartitionReadSuite.scala
│ ├── NumericTypesSuite.scala
│ ├── PlainNumberReadSuite.scala
│ ├── ProjectionAndFilterPushdownSuite.scala
│ ├── RowNumberColumnSuite.scala
│ ├── TableReadSuite.scala
│ ├── UserReportedIssuesSuite.scala
│ └── WriteAndReadSuite.scala
└── tags
└── package.scala
/.git-blame-ignore-revs:
--------------------------------------------------------------------------------
1 | # Scala Steward: Reformat with scalafmt 3.6.1
2 | a834cf94453ed2f3ab1b87818c2fd124fe87fa2a
3 |
4 | # Scala Steward: Reformat with scalafmt 3.7.11
5 | 11269e71a3460ae21f2a96ac8416c0bdd3f1f3b0
6 |
7 | # Scala Steward: Reformat with scalafmt 3.7.15
8 | 17f6ce5807fb3a91938824a285e30f786adea570
9 |
10 | # Scala Steward: Reformat with scalafmt 3.7.17
11 | e4fde8d1e6e34db2d24949275429ce3a7885c2ad
12 |
13 | # Scala Steward: Reformat with scalafmt 3.8.5
14 | 59dd3ea00b8772fd4e8798fde7941c1745ca83f2
15 |
16 | # Scala Steward: Reformat with scalafmt 3.9.5
17 | 19da40630c2645140336554bbce4a48881367bd2
18 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/config.yml:
--------------------------------------------------------------------------------
1 | blank_issues_enabled: false
2 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/generic.yml:
--------------------------------------------------------------------------------
1 | name: 🐞 Bug
2 | description: File a bug/issue
3 | title: "[BUG]
"
4 | labels: [Bug, Needs Triage]
5 | body:
6 | - type: checkboxes
7 | attributes:
8 | label: Am I using the newest version of the library?
9 | description: Please always use the latest version before posting any issues. Your bug might already have been solved.
10 | options:
11 | - label: I have made sure that I'm using the latest version of the library.
12 | required: true
13 | - type: checkboxes
14 | attributes:
15 | label: Is there an existing issue for this?
16 | description: Please search to see if an issue already exists for the bug you encountered.
17 | options:
18 | - label: I have searched the existing issues
19 | required: true
20 | - type: textarea
21 | attributes:
22 | label: Current Behavior
23 | description: A concise description of what you're experiencing.
24 | validations:
25 | required: false
26 | - type: textarea
27 | attributes:
28 | label: Expected Behavior
29 | description: A concise description of what you expected to happen.
30 | validations:
31 | required: false
32 | - type: textarea
33 | attributes:
34 | label: Steps To Reproduce
35 | description: Steps to reproduce the behavior.
36 | placeholder: |
37 | Steps to Reproduce (for bugs)
38 | Provide a link to a live example, or an unambiguous set of steps to reproduce this bug. Include code to reproduce, if relevant. Example:
39 | Download the example file uploaded here
40 | Start Spark from command line as spark-shell --packages dev.mauch:spark-excel_2.12:x.y.z --foo=bar
41 | Read the downloaded example file
42 | val df = spark.read
43 | .format("dev.mauch.spark.excel")
44 | .option("dataAddress", "'My Sheet'!B3:C35")
45 | .load("example_file_exhibiting_bug.xlsx")
46 | validations:
47 | required: false
48 | - type: textarea
49 | attributes:
50 | label: Environment
51 | description: |
52 | examples:
53 | Include as many relevant details about the environment you experienced the bug in
54 | Spark version and language (Scala, Java, Python, R, ...):
55 | Spark-Excel version:
56 | Operating System and version, cluster environment, ...:
57 | value: |
58 | - Spark version:
59 | - Spark-Excel version:
60 | - OS:
61 | - Cluster environment:
62 | render: markdown
63 | validations:
64 | required: false
65 | - type: textarea
66 | attributes:
67 | label: Anything else?
68 | description: |
69 | Links? References? Anything that will give us more context about the issue you are encountering!
70 |
71 | Tip: You can attach images or log files by clicking this area to highlight it and then dragging files in.
72 | validations:
73 | required: false
74 |
--------------------------------------------------------------------------------
/.github/dependabot.yml:
--------------------------------------------------------------------------------
1 | # To get started with Dependabot version updates, you'll need to specify which
2 | # package ecosystems to update and where the package manifests are located.
3 | # Please see the documentation for all configuration options:
4 | # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
5 |
6 | version: 2
7 | updates:
8 | - package-ecosystem: "github-actions" # See documentation for possible values
9 | directory: "/" # Location of package manifests
10 | schedule:
11 | interval: "weekly"
12 |
--------------------------------------------------------------------------------
/.github/workflows/changelog.yaml:
--------------------------------------------------------------------------------
1 | name: Changelog
2 |
3 | on:
4 | push:
5 | tags:
6 | - v[0-9]+.[0-9]+.[0-9]+
7 |
8 | jobs:
9 | deploy:
10 | runs-on: ubuntu-latest
11 |
12 | steps:
13 | - name: Checkout Code
14 | uses: actions/checkout@v4
15 |
16 | - name: Update CHANGELOG
17 | id: changelog
18 | uses: Requarks/changelog-action@v1
19 | with:
20 | token: ${{ github.token }}
21 | tag: ${{ github.ref_name }}
22 |
23 | - name: Commit CHANGELOG.md
24 | uses: stefanzweifel/git-auto-commit-action@v5
25 | with:
26 | branch: main
27 | commit_message: 'docs: update CHANGELOG.md for ${{ github.ref_name }} [skip ci]'
28 | file_pattern: CHANGELOG.md
29 |
--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
1 | name: Continuous Integration
2 |
3 | on:
4 | pull_request:
5 | branches: ['**', '!update/**', '!pr/**']
6 | push:
7 | branches: ['**', '!update/**', '!pr/**']
8 | tags: [v*]
9 |
10 | env:
11 | PGP_PASSPHRASE: ${{ secrets.PGP_PASSPHRASE }}
12 | SONATYPE_PASSWORD: ${{ secrets.SONATYPE_PASSWORD }}
13 | SONATYPE_CREDENTIAL_HOST: ${{ secrets.SONATYPE_CREDENTIAL_HOST }}
14 | SONATYPE_USERNAME: ${{ secrets.SONATYPE_USERNAME }}
15 | PGP_SECRET: ${{ secrets.PGP_SECRET }}
16 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
17 |
18 | jobs:
19 | prepare:
20 | runs-on: ubuntu-latest
21 | outputs:
22 | matrix: ${{ steps.set-matrix.outputs.matrix }}
23 | steps:
24 | - name: Checkout
25 | uses: actions/checkout@v4
26 |
27 | - name: Generate matrix
28 | id: set-matrix
29 | run: |
30 | echo -n "matrix=" >> $GITHUB_OUTPUT
31 | ./mill resolve "spark-excel[_,_]" | \
32 | jq -Rsc 'split("\n") | map(capture("spark-excel\\[(?<scala>[^,]+),(?<spark>[^\\]]+)\\]") | select(.)) | {include: .}' >> $GITHUB_OUTPUT
33 |
34 | build:
35 | needs: prepare
36 | name: Build and Test
37 | strategy:
38 | fail-fast: false
39 | matrix: ${{ fromJson(needs.prepare.outputs.matrix) }}
40 | runs-on: ubuntu-latest
41 | steps:
42 | - name: Checkout current branch (full)
43 | uses: actions/checkout@v4
44 | with:
45 | fetch-depth: 0
46 |
47 | - name: Download Java (temurin@11)
48 | id: download-java-temurin-11
49 | uses: typelevel/download-java@v2
50 | with:
51 | distribution: temurin
52 | java-version: 11
53 |
54 | - name: Setup Java (temurin@11)
55 | uses: actions/setup-java@v4
56 | with:
57 | distribution: jdkfile
58 | java-version: 11
59 | jdkFile: ${{ steps.download-java-temurin-11.outputs.jdkFile }}
60 |
61 | - name: Cache mill
62 | uses: actions/cache@v4
63 | with:
64 | path: |
65 | ~/.mill
66 | ~/.ivy2/cache
67 | ~/.coursier/cache/v1
68 | ~/.cache/coursier/v1
69 | ~/AppData/Local/Coursier/Cache/v1
70 | ~/Library/Caches/Coursier/v1
71 | key: ${{ runner.os }}-mill-cache-v2-${{ hashFiles('**/*.mill') }}-${{ hashFiles('project/build.properties') }}
72 |
73 | - name: Test
74 | run: ./mill spark-excel[${{ matrix.scala }},${{ matrix.spark }}].test
75 |
76 | - name: Publish Test Report
77 | uses: mikepenz/action-junit-report@v5
78 | if: always() # always run even if the previous step fails
79 | with:
80 | fail_on_failure: false
81 | include_passed: false
82 | detailed_summary: true
83 | annotate_only: true
84 | require_tests: false
85 | report_paths: 'out/**/test-report.xml'
86 |
87 | publish:
88 | name: Publish Artifacts
89 | needs: [build]
90 | if: github.event_name != 'pull_request' && (startsWith(github.ref, 'refs/tags/v') || github.ref == 'refs/heads/main')
91 | strategy:
92 | matrix:
93 | os: [ubuntu-latest]
94 | scala: [2.12.20]
95 | java: [temurin@11]
96 | runs-on: ${{ matrix.os }}
97 | steps:
98 | - name: Checkout current branch (full)
99 | uses: actions/checkout@v4
100 | with:
101 | fetch-depth: 0
102 |
103 | - name: Download Java (temurin@11)
104 | id: download-java-temurin-11
105 | if: matrix.java == 'temurin@11'
106 | uses: typelevel/download-java@v2
107 | with:
108 | distribution: temurin
109 | java-version: 11
110 |
111 | - name: Setup Java (temurin@11)
112 | if: matrix.java == 'temurin@11'
113 | uses: actions/setup-java@v4
114 | with:
115 | distribution: jdkfile
116 | java-version: 11
117 | jdkFile: ${{ steps.download-java-temurin-11.outputs.jdkFile }}
118 |
119 | - name: Cache mill
120 | uses: actions/cache@v4
121 | with:
122 | path: |
123 | ~/.mill
124 | ~/.ivy2/cache
125 | ~/.coursier/cache/v1
126 | ~/.cache/coursier/v1
127 | ~/AppData/Local/Coursier/Cache/v1
128 | ~/Library/Caches/Coursier/v1
129 | key: ${{ runner.os }}-mill-cache-v2-${{ hashFiles('**/*.mill') }}-${{ hashFiles('project/build.properties') }}
130 |
131 | - name: Import GPG Key
132 | uses: crazy-max/ghaction-import-gpg@v6
133 | with:
134 | gpg_private_key: ${{ secrets.PGP_SECRET }}
135 | passphrase: ${{ secrets.PGP_PASSPHRASE }}
136 | trust_level: 5
137 |
138 | - name: Publish
139 | run: |
140 | export GPG_TTY=$(tty)
141 | ./mill -i mill.scalalib.SonatypeCentralPublishModule/ \
142 | --username $SONATYPE_USERNAME \
143 | --password $SONATYPE_PASSWORD \
144 | --gpgArgs "--passphrase=$PGP_PASSPHRASE,--no-tty,--pinentry-mode,loopback,--batch,--yes,-a,-b" \
145 | --bundleName dev.mauch-spark-excel-$(date +%Y-%m-%d-%H-%M)
146 |
--------------------------------------------------------------------------------
/.github/workflows/clean.yml:
--------------------------------------------------------------------------------
1 | name: Clean
2 |
3 | on: push
4 |
5 | jobs:
6 | delete-artifacts:
7 | name: Delete Artifacts
8 | runs-on: ubuntu-latest
9 | env:
10 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
11 | steps:
12 | - name: Delete artifacts
13 | run: |
14 | # Customize those three lines with your repository and credentials:
15 | REPO=${GITHUB_API_URL}/repos/${{ github.repository }}
16 |
17 | # A shortcut to call GitHub API.
18 | ghapi() { curl --silent --location --user _:$GITHUB_TOKEN "$@"; }
19 |
20 | # A temporary file which receives HTTP response headers.
21 | TMPFILE=/tmp/tmp.$$
22 |
23 | # An associative array, key: artifact name, value: number of artifacts of that name.
24 | declare -A ARTCOUNT
25 |
26 | # Process all artifacts on this repository, loop on returned "pages".
27 | URL=$REPO/actions/artifacts
28 | while [[ -n "$URL" ]]; do
29 |
30 | # Get current page, get response headers in a temporary file.
31 | JSON=$(ghapi --dump-header $TMPFILE "$URL")
32 |
33 | # Get URL of next page. Will be empty if we are at the last page.
34 | URL=$(grep '^Link:' "$TMPFILE" | tr ',' '\n' | grep 'rel="next"' | head -1 | sed -e 's/.*<//' -e 's/>.*//')
35 | rm -f $TMPFILE
36 |
37 | # Number of artifacts on this page:
38 | COUNT=$(( $(jq <<<$JSON -r '.artifacts | length') ))
39 |
40 | # Loop on all artifacts on this page.
41 | for ((i=0; $i < $COUNT; i++)); do
42 |
43 | # Get name of artifact and count instances of this name.
44 | name=$(jq <<<$JSON -r ".artifacts[$i].name?")
45 | ARTCOUNT[$name]=$(( $(( ${ARTCOUNT[$name]} )) + 1))
46 |
47 | id=$(jq <<<$JSON -r ".artifacts[$i].id?")
48 | size=$(( $(jq <<<$JSON -r ".artifacts[$i].size_in_bytes?") ))
49 | printf "Deleting '%s' #%d, %'d bytes\n" $name ${ARTCOUNT[$name]} $size
50 | ghapi -X DELETE $REPO/actions/artifacts/$id
51 | done
52 | done
53 |
--------------------------------------------------------------------------------
/.github/workflows/dependency-graph.yml:
--------------------------------------------------------------------------------
1 | name: github-dependency-graph
2 |
3 | on:
4 | push:
5 | branches:
6 | - main
7 |
8 | jobs:
9 | submit-dependency-graph:
10 | runs-on: ubuntu-latest
11 | steps:
12 | - uses: actions/checkout@v4
13 | - uses: coursier/cache-action@v6
14 | - uses: actions/setup-java@v4
15 | with:
16 | distribution: 'temurin'
17 | java-version: '17'
18 | - uses: ckipp01/mill-dependency-submission@v1
19 |
--------------------------------------------------------------------------------
/.github/workflows/potential-duplicates.yml:
--------------------------------------------------------------------------------
1 | name: Potential Duplicates
2 | on:
3 | issues:
4 | types: [opened, edited]
5 | jobs:
6 | run:
7 | runs-on: ubuntu-latest
8 | steps:
9 | - uses: wow-actions/potential-duplicates@v1
10 | with:
11 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
12 | # Issue title filter works with anymatch https://www.npmjs.com/package/anymatch.
13 | # Any matched issue will stop detection immediately.
14 | # You can specify multi filters in each line.
15 | filter: ''
16 | # Exclude keywords in title before detecting.
17 | exclude: ''
18 | # Label to set, when potential duplicates are detected.
19 | label: potential-duplicate
20 | # Get issues with state to compare. Supported state: 'all', 'closed', 'open'.
21 | state: all
22 | # If similarity is higher than this threshold([0,1]), issue will be marked as duplicate.
23 | threshold: 0.6
25 | # Reactions to be added to the comment when potential duplicates are detected.
25 | # Available reactions: "-1", "+1", "confused", "laugh", "heart", "hooray", "rocket", "eyes"
26 | reactions: 'eyes, confused'
27 | # Comment to post when potential duplicates are detected.
28 | comment: >
29 | Please check these potential duplicates: {{#issues}}
30 | - [#{{ number }}] {{ title }} ({{ accuracy }}%)
31 | {{/issues}}
32 |
33 | If this issue is a duplicate, please add any additional info to the ticket with the most information and close this one.
34 |
--------------------------------------------------------------------------------
/.github/workflows/pr-agent.yaml:
--------------------------------------------------------------------------------
1 | on:
2 | pull_request:
3 | issue_comment:
4 | jobs:
5 | pr_agent_job:
6 | runs-on: ubuntu-latest
7 | permissions:
8 | issues: write
9 | pull-requests: write
10 | contents: write
11 | name: Run pr agent on every pull request, respond to user comments
12 | steps:
13 | - name: PR Agent action step
14 | id: pragent
15 | uses: Codium-ai/pr-agent@main
16 | env:
17 | OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
18 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
19 |
--------------------------------------------------------------------------------
/.github/workflows/rebase.yml:
--------------------------------------------------------------------------------
1 | name: Automatic Rebase
2 | on:
3 | issue_comment:
4 | types: [created]
5 | jobs:
6 | rebase:
7 | name: Rebase
8 | if: github.event.issue.pull_request != '' && contains(github.event.comment.body, '/rebase')
9 | runs-on: ubuntu-latest
10 | steps:
11 | - name: Checkout the latest code
12 | uses: actions/checkout@v4
13 | with:
14 | token: ${{ secrets.GITHUB_TOKEN }}
15 | fetch-depth: 0 # otherwise, you will fail to push refs to dest repo
16 | - name: Automatic Rebase
17 | uses: cirrus-actions/rebase@1.8
18 | env:
19 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
20 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | target/
2 | project/target/
3 | project/project/
4 | out/
5 | *.p12
6 | .ensime*
7 | *.swp
8 | .idea
9 | *.log
10 |
11 | .metals/
12 | project/metals.sbt
13 | **/.bsp/
14 | **/.bloop/
15 | .vscode
16 | private-key.pem
17 | .secrets
18 | .~lock.*.xlsx#
19 |
--------------------------------------------------------------------------------
/.mill-jvm-opts:
--------------------------------------------------------------------------------
1 | -Xmx4G
2 |
--------------------------------------------------------------------------------
/.mill-version:
--------------------------------------------------------------------------------
1 | 0.12.14
2 |
--------------------------------------------------------------------------------
/.scala-steward.conf:
--------------------------------------------------------------------------------
1 | updatePullRequests = "always"
2 | commits.message = "chore: Update ${artifactName} from ${currentVersion} to ${nextVersion}"
3 |
--------------------------------------------------------------------------------
/.scalafmt.conf:
--------------------------------------------------------------------------------
1 | version = 3.9.7
2 | style = default
3 | runner.dialect=scala212
4 | maxColumn = 120
5 | continuationIndent.defnSite = 2
6 | continuationIndent.callSite = 2
7 | align.preset = "none"
8 | danglingParentheses.preset = true
9 | optIn.configStyleArguments = false
10 | docstrings.style = SpaceAsterisk
11 | spaces.beforeContextBoundColon = true
12 | rewrite.rules = [SortImports]
13 |
--------------------------------------------------------------------------------
/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing
2 |
3 | When contributing to this repository, please first discuss the change you wish to make via an issue
4 | with the owners of this repository before making a change.
5 |
6 | ## Pull Request Process
7 |
8 | 1. Unless the changes are trivial extensions or bugfixes,
9 | please create an issue proposing what you want to change first.
10 | 2. After coordination with the project maintainers,
11 | go ahead and create the PR.
12 | 3. If you want to do larger refactorings that are not obviously necessary for the PR
13 | please coordinate with the project maintainers first.
14 | We're open to refactorings but would like to discuss and review them independently.
15 | 4. Auto-format your code using `mill mill.scalalib.scalafmt.ScalafmtModule/reformatAll __.sources`.
16 | 5. Run all tests locally using `mill spark-excel[__].test`.
17 | 6. Update the `README.md` and `CHANGELOG.md` with details of changes to the interface.
18 | 7. Rebase your changes to the latest master in case something changed there.
19 |
--------------------------------------------------------------------------------
/ISSUE_TEMPLATE.md:
--------------------------------------------------------------------------------
1 | Your issue may already be reported!
2 | Please search on the [issue tracker](../) before creating one.
3 | Moreover, please read the [`CHANGELOG.md`](../../blob/master/CHANGELOG.md) file for any changes you might have missed.
4 |
5 | ## Expected Behavior
6 | > If you're describing a bug, tell us what should happen
7 | > If you're suggesting a change/improvement, tell us how it should work
8 |
9 | ## Current Behavior
10 | > If describing a bug, tell us what happens instead of the expected behavior
11 | > If suggesting a change/improvement, explain the difference from current behavior.
12 | > If you have a stack trace or any helpful information from the console, paste it in its entirety.
13 | > If the problem happens with a certain file, upload it somewhere and paste a link.
14 |
15 | ## Possible Solution
16 | > Not obligatory, but suggest a fix/reason for the bug,
17 | > or ideas how to implement the addition or change
18 |
19 | ## Steps to Reproduce (for bugs)
20 | > Provide a link to a live example, or an unambiguous set of steps to
21 | > reproduce this bug. Include code to reproduce, if relevant.
22 | > Example:
23 | 1. Download the example file uploaded [here](http://example.com/)
24 | 2. Start Spark from command line as `spark-shell --packages dev.mauch:spark-excel_2.12:x.y.z --foo=bar`
25 | 3. Read the downloaded example file
26 | ```
27 | val df = spark.read
28 | .format("dev.mauch.spark.excel")
29 | .option("dataAddress", "'My Sheet'!B3:C35")
30 | .load("example_file_exhibiting_bug.xlsx")
31 | ```
32 |
33 | ## Context
34 | > How has this issue affected you? What are you trying to accomplish?
35 | > Providing context helps us come up with a solution that is most useful in the real world
36 |
37 | ## Your Environment
38 | > Include as many relevant details about the environment you experienced the bug in
39 | * Spark version and language (Scala, Java, Python, R, ...):
40 | * Spark-Excel version:
41 | * Operating System and version, cluster environment, ...:
42 |
--------------------------------------------------------------------------------
/build.mill:
--------------------------------------------------------------------------------
1 | import coursier.maven.MavenRepository
2 | import mill._, scalalib._, publish._
3 | import Assembly._
4 | import $ivy.`de.tototec::de.tobiasroeser.mill.vcs.version::0.4.0`
5 | import de.tobiasroeser.mill.vcs.version.VcsVersion
6 |
7 | trait SparkModule extends Cross.Module2[String, String] with SbtModule with SonatypeCentralPublishModule {
8 | outer =>
9 | override def scalaVersion = crossValue
10 | val sparkVersion = crossValue2
11 | val Array(sparkMajor, sparkMinor, sparkPatch) = sparkVersion.split("\\.")
12 | val sparkBinaryVersion = s"$sparkMajor.$sparkMinor"
13 |
14 | override def millSourcePath = super.millSourcePath / os.up
15 |
16 | object LowerOrEqual {
17 | def unapply(otherVersion: String): Boolean = otherVersion match {
18 | case s"${sparkMaj}.${sparkMin}.${sparkPat}" =>
19 | sparkMaj == sparkMajor && (sparkMin < sparkMinor || (sparkMin == sparkMinor && sparkPat <= sparkPatch))
20 | case s"${sparkMaj}.${sparkMin}" => sparkMaj == sparkMajor && sparkMin <= sparkMinor
21 | case sparkMaj => sparkMaj == sparkMajor
22 | }
23 | }
24 | object HigherOrEqual {
25 | def unapply(otherVersion: String): Boolean = otherVersion match {
26 | case s"${sparkMaj}.${sparkMin}.${sparkPat}" =>
27 | sparkMaj == sparkMajor && (sparkMin > sparkMinor || (sparkMin == sparkMinor && sparkPat >= sparkPatch))
28 | case s"${sparkMaj}.${sparkMin}" => sparkMaj == sparkMajor && sparkMin >= sparkMinor
29 | case sparkMaj => sparkMaj == sparkMajor
30 | }
31 | }
32 |
33 | def sparkVersionSpecificSources = T {
34 | val versionSpecificDirs = os.list(mill.api.WorkspaceRoot.workspaceRoot / "src" / "main")
35 | val Array(sparkMajor, sparkMinor, sparkPatch) = sparkVersion.split("\\.")
36 | val sparkBinaryVersion = s"$sparkMajor.$sparkMinor"
37 | versionSpecificDirs.filter(_.last match {
38 | case "scala" => true
39 | case `sparkBinaryVersion` => true
40 | case s"${LowerOrEqual()}_and_up" => true
41 | case s"${LowerOrEqual()}_to_${HigherOrEqual()}" => true
42 | case _ => false
43 | })
44 | }
45 | override def sources = T.sources {
46 | super.sources() ++ sparkVersionSpecificSources().map(PathRef(_))
47 | }
48 |
49 | override def docSources = T.sources(Seq[PathRef]())
50 |
51 | override def artifactName = "spark-excel"
52 |
53 | override def publishVersion: T[String] = T {
54 | val vcsVersion = VcsVersion.vcsState().format(untaggedSuffix = "-SNAPSHOT")
55 | s"${sparkVersion}_${vcsVersion}"
56 | }
57 | def pomSettings = PomSettings(
58 | description = "A Spark plugin for reading and writing Excel files",
59 | organization = "dev.mauch",
60 | url = "https://github.com/nightscape/spark-excel",
61 | licenses = Seq(License.`Apache-2.0`),
62 | versionControl = VersionControl.github("nightscape", "spark-excel"),
63 | developers = Seq(Developer("nightscape", "Martin Mauch", "https://github.com/nightscape"))
64 | )
65 |
66 | def assemblyRules = Seq(
67 | Rule.AppendPattern(".*\\.conf"), // all *.conf files will be concatenated into single file
68 | Rule.Relocate("org.apache.commons.io.**", "shadeio.commons.io.@1"),
69 | Rule.Relocate("org.apache.commons.compress.**", "shadeio.commons.compress.@1")
70 | )
71 |
72 | override def extraPublish = Seq(
73 | PublishInfo(assembly(), classifier = None, ivyConfig = "compile"),
74 | PublishInfo(jar(), classifier = Some("thin"), ivyConfig = "compile")
75 | )
76 |
77 | override def sonatypeCentralReadTimeout: T[Int] = 600000
78 | override def sonatypeCentralAwaitTimeout: T[Int] = 1200 * 1000
79 |
80 | val sparkDeps = Agg(
81 | ivy"org.apache.spark::spark-core:$sparkVersion",
82 | ivy"org.apache.spark::spark-sql:$sparkVersion",
83 | ivy"org.apache.spark::spark-hive:$sparkVersion"
84 | )
85 |
86 | override def compileIvyDeps = if (sparkVersion < "3.3.0") {
87 | sparkDeps ++ Agg(ivy"org.slf4j:slf4j-api:1.7.36".excludeOrg("stax"))
88 | } else {
89 | sparkDeps
90 | }
91 |
92 | val poiVersion = "5.4.1"
93 |
94 | override def ivyDeps = {
95 | val base = Agg(
96 | ivy"org.apache.poi:poi:$poiVersion",
97 | ivy"org.apache.poi:poi-ooxml:$poiVersion",
98 | ivy"org.apache.poi:poi-ooxml-lite:$poiVersion",
99 | ivy"org.apache.xmlbeans:xmlbeans:5.3.0",
100 | ivy"com.norbitltd::spoiwo:2.2.1",
101 | ivy"com.github.pjfanning:excel-streaming-reader:5.1.0",
102 | ivy"commons-io:commons-io:2.19.0",
103 | ivy"org.apache.commons:commons-compress:1.27.1",
104 | ivy"org.apache.logging.log4j:log4j-api:2.24.3",
105 | ivy"com.zaxxer:SparseBitSet:1.3",
106 | ivy"org.apache.commons:commons-collections4:4.5.0",
107 | ivy"com.github.virtuald:curvesapi:1.08",
108 | ivy"commons-codec:commons-codec:1.18.0",
109 | ivy"org.apache.commons:commons-math3:3.6.1",
110 | ivy"org.scala-lang.modules::scala-collection-compat:2.13.0"
111 | )
112 | if (sparkVersion >= "3.3.0") {
113 | base ++ Agg(ivy"org.apache.logging.log4j:log4j-core:2.24.3")
114 | } else {
115 | base
116 | }
117 | }
118 |
119 | object test extends SbtTests with TestModule.ScalaTest {
120 |
121 | override def millSourcePath = super.millSourcePath
122 |
123 | override def sources = T.sources {
124 | Seq(PathRef(millSourcePath / "src" / "test" / "scala"))
125 | }
126 |
127 | override def resources = T.sources {
128 | Seq(PathRef(millSourcePath / "src" / "test" / "resources"))
129 | }
130 |
131 | def scalaVersion = outer.scalaVersion()
132 |
133 | def repositoriesTask = T.task {
134 | super.repositoriesTask() ++ Seq(MavenRepository("https://jitpack.io"))
135 | }
136 |
137 | def ivyDeps = sparkDeps ++ Agg(
138 | ivy"org.typelevel::cats-core:2.13.0",
139 | ivy"org.scalatest::scalatest:3.2.19",
140 | ivy"org.scalatestplus::scalacheck-1-16:3.2.14.0",
141 | ivy"org.scalacheck::scalacheck:1.18.1",
142 | ivy"com.github.alexarchambault::scalacheck-shapeless_1.15:1.3.0",
143 | ivy"com.github.mrpowers::spark-fast-tests:1.3.0",
144 | ivy"org.scalamock::scalamock:5.2.0"
145 | )
146 | }
147 |
148 | }
149 |
150 | val scala213 = "2.13.16"
151 | val scala212 = "2.12.20"
152 | val spark24 = List("2.4.8")
153 | val spark30 = List("3.0.3")
154 | val spark31 = List("3.1.3")
155 | val spark32 = List("3.2.4")
156 | val spark33 = List("3.3.4")
157 | val spark34 = List("3.4.4", "3.4.1")
158 | val spark35 = List("3.5.6")
159 | val sparkVersions = spark24 ++ spark30 ++ spark31 ++ spark32 ++ spark33 ++ spark34 ++ spark35
160 | val crossMatrix =
161 | sparkVersions.map(spark => (scala212, spark)) ++
162 | sparkVersions.filter(_ >= "3.2").map(spark => (scala213, spark))
163 |
164 | object `spark-excel` extends Cross[SparkModule](crossMatrix) {}
165 |
--------------------------------------------------------------------------------
/docs/README.md:
--------------------------------------------------------------------------------
1 | # Documentation
2 |
3 | We need help here! Please send us a PR with any examples or documentation you
4 | care to write that may be of help to others.
5 |
6 | ## Example Notebook
7 |
8 | [spark_excel_examples.ipynb](spark_excel_examples.ipynb) contains examples in
9 | a notebook format.
10 |
11 | ## Azure Synapse
12 |
13 | [azure_synapse.md](azure_synapse.md) has some instructions for loading spark-excel into a
14 | Spark pool for Azure Synapse.
--------------------------------------------------------------------------------
/docs/azure_synapse.md:
--------------------------------------------------------------------------------
1 | # Azure Synapse
2 |
3 | Adding the spark-excel library to the Spark workspace will enable reading
4 | and writing of Excel files to an Azure Storage Account.
5 |
6 | At the time of writing, the following libraries have to be added to the
7 | workspace and then configured for each Spark Pool.
8 |
9 | Each library can be downloaded from [Maven Central](https://search.maven.org)
10 | (thanks Sonatype!).
11 |
12 | * spark-excel_2.12-3.1.2_0.16.5-pre2.jar
13 | * log4j-core-2.17.2.jar
14 | * log4j-api-2.17.2.jar
15 | * xmlbeans-5.0.3.jar
16 | * poi-ooxml-lite-5.2.2.jar
17 | * commons-collections4-4.4.jar
18 |
19 | Once those have been applied, the Excel files can be read into a dataframe like so:
20 |
21 | ```
22 | excel_path = "abfss://@.dfs.core.windows.net/"
23 | df = (spark.read
24 | .format("excel")
25 | .load(excel_path)
26 | )
27 | display(df)
28 | ```
29 |
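30 | Writing a dataframe back to the storage account works the same way. A minimal sketch in Scala (the sheet address, save mode, and output path are illustrative):
31 |
32 | ```
33 | df.write
34 |   .format("excel")
35 |   .option("dataAddress", "'Sheet1'!A1")
36 |   .mode("overwrite")
37 |   .save("abfss://<container>@<storage_account>.dfs.core.windows.net/<output_path>")
38 | ```
39 |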
--------------------------------------------------------------------------------
/private-key.pem.enc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/private-key.pem.enc
--------------------------------------------------------------------------------
/src/README.md:
--------------------------------------------------------------------------------
1 | Spark-excel Source Code Structure
2 | =================================
3 |
4 | Under the hood, spark-excel has two implementations. Starting with spark-excel 0.14.0, we added spark-excel V2, which uses the Spark Data Source API V2.
5 |
6 | The two implementations are compatible with each other in terms of options and behavior. However, some features of spark-excel V2 are not available in the original implementation, for example loading multiple Excel files at once or handling corrupted records.
7 |
8 | The Spark Data Source API V2 has been evolving since Spark 2.3. To keep the spark-excel V2 code to a minimum, it relies heavily on the utilities and improvements of each upstream Spark version.
9 |
10 | Spark-excel V2 introduces Spark-version-specific source folders, for example:
11 | * `2.4/.../spark/excel/v2` for the Spark 2.4 Data Source API V2
12 | * `3.0_and_up/.../spark/excel/v2` for code shared by all Spark 3.x versions
13 | * `3.0_to_3.1/.../spark/excel/v2` for code shared between Spark 3.0 and Spark 3.1
14 |
15 | These structures are also configured in [build.mill](https://github.com/nightscape/spark-excel/blob/main/build.mill), so the project can be compiled for each Spark version.
16 |
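17 | As a quick usage sketch (assuming a `SparkSession` named `spark`; the file name and data address are illustrative), the two implementations are selected via their format names:
18 |
19 | ```
20 | // Original (V1) implementation
21 | val dfV1 = spark.read
22 |   .format("dev.mauch.spark.excel")
23 |   .option("dataAddress", "'My Sheet'!B3:C35")
24 |   .load("example.xlsx")
25 |
26 | // V2 implementation (Spark Data Source API V2), registered under the short name "excel"
27 | val dfV2 = spark.read
28 |   .format("excel")
29 |   .option("dataAddress", "'My Sheet'!B3:C35")
30 |   .load("example.xlsx")
31 | ```
32 |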
--------------------------------------------------------------------------------
/src/main/2.4/scala/dev/mauch/spark/ExcelSparkInternal.scala:
--------------------------------------------------------------------------------
1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape)
2 | *
3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4 | * the License. You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10 | * specific language governing permissions and limitations under the License.
11 | */
12 | package org.apache.spark.nightscape
13 |
14 | import java.nio.file.{Files, Paths}
15 | import org.apache.spark.rdd.InputFileBlockHolder
16 |
17 | /** To provide input-file-name value. The sole purpose of this is for proxying into spark internal implementation of
18 | * InputFileBlockHolder
19 | */
20 | object ExcelSparkInternal {
21 | def setInputFileName(path: String): Unit = {
22 | InputFileBlockHolder.set(path, 0, Files.size(Paths.get(path)))
23 | }
24 | }
25 |
--------------------------------------------------------------------------------
/src/main/2.4/scala/dev/mauch/spark/excel/v2/ExcelDateTimeStringUtils.scala:
--------------------------------------------------------------------------------
1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape)
2 | *
3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4 | * the License. You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10 | * specific language governing permissions and limitations under the License.
11 | */
12 | package dev.mauch.spark.excel.v2
13 |
14 | import org.apache.spark.unsafe.types.UTF8String
15 | import org.apache.spark.sql.catalyst.util._
16 | import java.time.ZoneId
17 | import org.apache.spark.sql.catalyst.util.TimestampFormatter
18 |
19 | /** Wrapping the API change between spark 3.0 vs 3.1 */
20 | object ExcelDateTimeStringUtils {
21 | def stringToTimestamp(v: String, zoneId: ZoneId): Option[Long] = {
22 | val str = UTF8String.fromString(v)
23 | DateTimeUtils.stringToTimestamp(str, java.util.TimeZone.getTimeZone(zoneId))
24 | }
25 |
26 | def stringToDate(v: String, zoneId: ZoneId): Option[Int] = {
27 | val str = UTF8String.fromString(v)
28 | DateTimeUtils.stringToDate(str)
29 | }
30 |
31 | def getTimestampFormatter(options: ExcelOptions): TimestampFormatter =
32 | TimestampFormatter(options.timestampFormat, java.util.TimeZone.getTimeZone(options.zoneId), options.locale)
33 |
34 | def getDateFormatter(options: ExcelOptions): DateFormatter =
35 | DateFormatter(options.dateFormat, options.locale)
36 | }
37 |
--------------------------------------------------------------------------------
/src/main/2.4/scala/dev/mauch/spark/excel/v2/ExcelFilters.scala:
--------------------------------------------------------------------------------
1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape)
2 | *
3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4 | * the License. You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10 | * specific language governing permissions and limitations under the License.
11 | */
12 | package dev.mauch.spark.excel.v2
13 |
14 | import org.apache.spark.sql.sources
15 | import org.apache.spark.sql.types.StructType
16 | import org.apache.spark.sql.catalyst.InternalRow
17 |
18 | /** Wrapping the API change between spark 3.0 vs 3.1 */
19 | class ExcelFilters(filters: Seq[sources.Filter], requiredSchema: StructType) {
20 | def skipRow(row: InternalRow, index: Int): Boolean = { false }
21 | }
22 |
23 | object ExcelFilters {
24 | def pushedFilters(filters: Array[sources.Filter], schema: StructType): Array[sources.Filter] =
25 | filters
26 | }
27 |
--------------------------------------------------------------------------------
/src/main/2.4/scala/dev/mauch/spark/excel/v2/ExcelOptions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
20 | import org.apache.spark.sql.internal.SQLConf
21 |
22 | class ExcelOptions(
23 | @transient
24 | val parameters: CaseInsensitiveMap[String],
25 | val defaultTimeZoneId: String,
26 | val defaultColumnNameOfCorruptRecord: String
27 | ) extends ExcelOptionsTrait
28 | with Serializable {
29 | // all parameter handling is implemented in ExcelOptionsTrait
30 |
31 | def this(parameters: Map[String, String], defaultTimeZoneId: String) = {
32 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, SQLConf.get.columnNameOfCorruptRecord)
33 | }
34 |
35 | def this(parameters: Map[String, String], defaultTimeZoneId: String, defaultColumnNameOfCorruptRecord: String) = {
36 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, defaultColumnNameOfCorruptRecord)
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/2.4/scala/dev/mauch/spark/excel/v2/ExcelParserBase.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import _root_.org.apache.spark.sql.catalyst.util.BadRecordException
20 | import org.apache.spark.unsafe.types.UTF8String
21 | import org.apache.spark.sql.catalyst.InternalRow
22 |
23 | trait ExcelParserBase {
24 |
25 | protected def getCurrentInput: UTF8String
26 | def badRecord(partialResults: Array[InternalRow], baseException: Throwable): BadRecordException =
27 | BadRecordException(() => getCurrentInput, () => partialResults.headOption, baseException)
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/2.4/scala/dev/mauch/spark/excel/v2/FailureSafeParser.scala:
--------------------------------------------------------------------------------
1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape)
2 | *
3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4 | * the License. You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10 | * specific language governing permissions and limitations under the License.
11 | */
12 | package dev.mauch.spark.excel.v2
13 |
14 | import org.apache.spark.SparkException
15 | import org.apache.spark.sql.catalyst.InternalRow
16 | import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
17 | import org.apache.spark.sql.types.StructType
18 | import org.apache.spark.unsafe.types.UTF8String
19 | import org.apache.spark.sql.catalyst.util._
20 |
21 | class FailureSafeParser[IN](
22 | rawParser: IN => Iterable[InternalRow],
23 | mode: ParseMode,
24 | schema: StructType,
25 | columnNameOfCorruptRecord: String
26 | ) {
27 |
28 | private val corruptFieldIndex =
29 | if (schema.fieldNames.contains(columnNameOfCorruptRecord)) {
30 | Some(schema.fieldIndex(columnNameOfCorruptRecord))
31 | } else None
32 |
33 | private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord))
34 | private val resultRow = new GenericInternalRow(schema.length)
35 | private val nullResult = new GenericInternalRow(schema.length)
36 |
37 | // This function takes 2 parameters: an optional partial result, and the bad record. If the given
38 | // schema doesn't contain a field for corrupted record, we just return the partial result or a
39 | // row with all fields null. If the given schema contains a field for corrupted record, we will
40 | // set the bad record to this field, and set other fields according to the partial result or null.
41 | private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = {
42 | if (corruptFieldIndex.isDefined) { (row, badRecord) =>
43 | {
44 | var i = 0
45 | while (i < actualSchema.length) {
46 | val from = actualSchema(i)
47 | resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull
48 | i += 1
49 | }
50 | resultRow(corruptFieldIndex.get) = badRecord()
51 | resultRow
52 | }
53 | } else { (row, _) => row.getOrElse(nullResult) }
54 | }
55 |
56 | def parse(input: IN): Iterator[InternalRow] = {
57 | try { rawParser.apply(input).toIterator.map(row => toResultRow(Some(row), () => null)) }
58 | catch {
59 | case e: BadRecordException =>
60 | mode match {
61 | case PermissiveMode => Iterator(toResultRow(e.partialResult(), e.record))
62 | case DropMalformedMode => Iterator.empty
63 | case FailFastMode =>
64 | throw new SparkException(
65 | "Malformed records are detected in record parsing. " +
66 | s"Parse Mode: ${FailFastMode.name}. To process malformed records as null " +
67 | "result, try setting the option 'mode' as 'PERMISSIVE'.",
68 | e
69 | )
70 | }
71 | }
72 | }
73 | }
74 |
--------------------------------------------------------------------------------
/src/main/2.4/scala/dev/mauch/spark/excel/v2/SchemaUtils.scala:
--------------------------------------------------------------------------------
1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape)
2 | *
3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4 | * the License. You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10 | * specific language governing permissions and limitations under the License.
11 | */
12 | package dev.mauch.spark.excel.v2
13 |
14 | import org.apache.spark.sql.catalyst.analysis._
15 | import org.apache.spark.sql.types.StructType
16 |
17 | /** Utils for handling schemas. (Copied from spark.util)
18 | */
19 | object SchemaUtils {
20 |
21 | /** Checks if an input schema has duplicate column names. This throws an exception if the duplication exists.
22 | *
23 | * @param schema
24 | * schema to check
25 | * @param colType
26 | * column type name, used in an exception message
27 | * @param caseSensitiveAnalysis
28 | * whether duplication checks should be case sensitive or not
29 | */
30 | def checkSchemaColumnNameDuplication(
31 | schema: StructType,
32 | colType: String,
33 | caseSensitiveAnalysis: Boolean = false
34 | ): Unit = { checkColumnNameDuplication(schema.map(_.name), colType, caseSensitiveAnalysis) }
35 |
36 | // Returns true if a given resolver is case-sensitive
37 | private def isCaseSensitiveAnalysis(resolver: Resolver): Boolean = {
38 | if (resolver == caseSensitiveResolution) { true }
39 | else if (resolver == caseInsensitiveResolution) { false }
40 | else {
41 | sys.error(
42 | "A resolver to check if two identifiers are equal must be " +
43 | "`caseSensitiveResolution` or `caseInsensitiveResolution` in o.a.s.sql.catalyst."
44 | )
45 | }
46 | }
47 |
48 | /** Checks if input column names have duplicate identifiers. This throws an exception if the duplication exists.
49 | *
50 | * @param columnNames
51 | * column names to check
52 | * @param colType
53 | * column type name, used in an exception message
54 | * @param resolver
55 | * resolver used to determine if two identifiers are equal
56 | */
57 | def checkColumnNameDuplication(columnNames: Seq[String], colType: String, resolver: Resolver): Unit = {
58 | checkColumnNameDuplication(columnNames, colType, isCaseSensitiveAnalysis(resolver))
59 | }
60 |
61 | /** Checks if input column names have duplicate identifiers. This throws an exception if the duplication exists.
62 | *
63 | * @param columnNames
64 | * column names to check
65 | * @param colType
66 | * column type name, used in an exception message
67 | * @param caseSensitiveAnalysis
68 | * whether duplication checks should be case sensitive or not
69 | */
70 | def checkColumnNameDuplication(columnNames: Seq[String], colType: String, caseSensitiveAnalysis: Boolean): Unit = {
71 | val names = if (caseSensitiveAnalysis) columnNames else columnNames.map(_.toLowerCase)
72 | if (names.distinct.length != names.length) {
73 | val duplicateColumns = names
74 | .groupBy(identity)
75 | .collect { case (x, ys) if ys.length > 1 => s"`$x`" }
76 | throw new RuntimeException(s"Found duplicate column(s) $colType: ${duplicateColumns.mkString(", ")}")
77 | }
78 | }
79 | }
80 |
--------------------------------------------------------------------------------
/src/main/3.0/scala/dev/mauch/spark/excel/v2/ExcelDateTimeStringUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.unsafe.types.UTF8String
20 | import org.apache.spark.sql.catalyst.util._
21 | import java.time.ZoneId
22 | import org.apache.spark.sql.catalyst.util.TimestampFormatter
23 | import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
24 |
25 | /** Wrapping the API change between spark 3.0 vs 3.1 */
26 | object ExcelDateTimeStringUtils {
27 | def stringToTimestamp(v: String, zoneId: ZoneId): Option[Long] = {
28 | val str = UTF8String.fromString(DateTimeUtils.cleanLegacyTimestampStr(v))
29 | DateTimeUtils.stringToTimestamp(str, zoneId)
30 | }
31 |
32 | def stringToDate(v: String, zoneId: ZoneId): Option[Int] = {
33 | val str = UTF8String.fromString(DateTimeUtils.cleanLegacyTimestampStr(v))
34 | DateTimeUtils.stringToDate(str, zoneId)
35 | }
36 |
37 | def getTimestampFormatter(options: ExcelOptions): TimestampFormatter = TimestampFormatter(
38 | options.timestampFormat,
39 | options.zoneId,
40 | options.locale,
41 | legacyFormat = FAST_DATE_FORMAT,
42 | isParsing = true
43 | )
44 |
45 | def getDateFormatter(options: ExcelOptions): DateFormatter =
46 | DateFormatter(options.dateFormat, options.zoneId, options.locale, legacyFormat = FAST_DATE_FORMAT, isParsing = true)
47 | }
48 |
--------------------------------------------------------------------------------
/src/main/3.0/scala/dev/mauch/spark/excel/v2/ExcelFilters.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.sql.catalyst.csv.CSVFilters
20 | import org.apache.spark.sql.sources
21 | import org.apache.spark.sql.types.StructType
22 |
23 | /** Wrapping the API change between spark 3.0 vs 3.1 */
24 | class ExcelFilters(filters: Seq[sources.Filter], requiredSchema: StructType)
25 | extends CSVFilters(filters, requiredSchema) {}
26 |
27 | object ExcelFilters {
28 | def pushedFilters(filters: Array[sources.Filter], schema: StructType): Array[sources.Filter] =
29 | CSVFilters.pushedFilters(filters, schema)
30 | }
31 |
--------------------------------------------------------------------------------
/src/main/3.0_and_up/scala/dev/mauch/spark/excel/v2/ExcelDataSource.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.sql.connector.catalog.Table
20 | import org.apache.spark.sql.execution.datasources.FileFormat
21 | import org.apache.spark.sql.execution.datasources.v2.FileDataSourceV2
22 | import org.apache.spark.sql.types.StructType
23 | import org.apache.spark.sql.util.CaseInsensitiveStringMap
24 |
25 | /** Derived from Spark own CSV implementation
26 | */
27 | class ExcelDataSource extends FileDataSourceV2 {
28 |
29 | override def fallbackFileFormat: Class[_ <: FileFormat] = classOf[ExcelFileFormat]
30 |
31 | override def getTable(options: CaseInsensitiveStringMap): Table = {
32 | val paths = getPaths(options)
33 | val tableName = getTableName(options, paths)
34 | val optionsWithoutPaths = getOptionsWithoutPaths(options)
35 | ExcelTable(tableName, sparkSession, optionsWithoutPaths, paths, None)
36 | }
37 |
38 | override def getTable(options: CaseInsensitiveStringMap, schema: StructType): Table = {
39 | val paths = getPaths(options)
40 | val tableName = getTableName(options, paths)
41 | val optionsWithoutPaths = getOptionsWithoutPaths(options)
42 | ExcelTable(tableName, sparkSession, optionsWithoutPaths, paths, Some(schema))
43 | }
44 |
45 | /** The string that represents the format that this data source provider uses
46 | */
47 | override def shortName(): String = "excel"
48 |
49 | }
50 |
--------------------------------------------------------------------------------
/src/main/3.0_and_up/scala/dev/mauch/spark/excel/v2/ExcelFileFormat.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.hadoop.conf.Configuration
20 | import org.apache.hadoop.fs.{FileStatus, Path}
21 | import org.apache.hadoop.mapreduce.{Job, TaskAttemptContext}
22 | import org.apache.spark.sql.SparkSession
23 | import org.apache.spark.sql.catalyst.InternalRow
24 | import org.apache.spark.sql.execution.datasources.{FileFormat, OutputWriter, OutputWriterFactory, PartitionedFile}
25 | import org.apache.spark.sql.sources.{DataSourceRegister, Filter}
26 | import org.apache.spark.sql.types._
27 |
28 | /** Derived from the binary file data source. Needed to support writing Excel files using the V2 API
29 | */
30 | class ExcelFileFormat extends FileFormat with DataSourceRegister {
31 |
32 | override def inferSchema(
33 | sparkSession: SparkSession,
34 | options: Map[String, String],
35 | files: Seq[FileStatus]
36 | ): Option[StructType] = {
37 | throw new UnsupportedOperationException("ExcelFileFormat as fallback format for V2 supports writing only")
38 | }
39 |
40 | override def prepareWrite(
41 | sparkSession: SparkSession,
42 | job: Job,
43 | options: Map[String, String],
44 | dataSchema: StructType
45 | ): OutputWriterFactory = {
46 | val excelOptions = new ExcelOptions(options, sparkSession.conf.get("spark.sql.session.timeZone"))
47 |
48 | new OutputWriterFactory {
49 | override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = {
50 | new ExcelOutputWriter(path, dataSchema, context, excelOptions)
51 | }
52 |
53 | override def getFileExtension(context: TaskAttemptContext): String =
54 | s".${excelOptions.fileExtension}"
55 | }
56 | }
57 |
58 | override def isSplitable(sparkSession: SparkSession, options: Map[String, String], path: Path): Boolean = {
59 | false
60 | }
61 |
62 | override def shortName(): String = "excel"
63 |
64 | /*
65 |    We need this class for writing only, thus the reader is not implemented
66 | */
67 | override protected def buildReader(
68 | sparkSession: SparkSession,
69 | dataSchema: StructType,
70 | partitionSchema: StructType,
71 | requiredSchema: StructType,
72 | filters: Seq[Filter],
73 | options: Map[String, String],
74 | hadoopConf: Configuration
75 | ): PartitionedFile => Iterator[InternalRow] = {
76 | throw new UnsupportedOperationException("ExcelFileFormat as fallback format for V2 supports writing only")
77 | }
78 |
79 | }
80 |
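
A usage sketch (placeholder output path): since this FileFormat supports writing only, it is exercised through the DataFrame writer with the "excel" format; the extension of the written part files comes from excelOptions.fileExtension.

import org.apache.spark.sql.SparkSession

object ExcelWriteSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("excel-write-sketch").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq(("a", 1), ("b", 2)).toDF("name", "value")
    df.write
      .format("excel")
      .option("header", "true")        // ExcelOutputWriter writes a header row when header is true
      .mode("overwrite")
      .save("/tmp/excel-write-sketch") // placeholder output directory
    spark.stop()
  }
}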
--------------------------------------------------------------------------------
/src/main/3.0_to_3.1/scala/dev/mauch/spark/excel/v2/ExcelOutputWriter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.hadoop.mapreduce.TaskAttemptContext
20 | import org.apache.spark.internal.Logging
21 | import org.apache.spark.sql.catalyst.InternalRow
22 | import org.apache.spark.sql.execution.datasources.OutputWriter
23 | import org.apache.spark.sql.types.StructType
24 |
25 | class ExcelOutputWriter(path: String, dataSchema: StructType, context: TaskAttemptContext, options: ExcelOptions)
26 | extends OutputWriter
27 | with Logging {
28 |
29 | private val gen = new ExcelGenerator(path, dataSchema, context.getConfiguration, options)
30 | if (options.header) { gen.writeHeaders() }
31 |
32 | override def write(row: InternalRow): Unit = gen.write(row)
33 |
34 | override def close(): Unit = gen.close()
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/3.0_to_3.1/scala/dev/mauch/spark/excel/v2/ExcelTable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.hadoop.fs.FileStatus
20 | import org.apache.spark.sql.SparkSession
21 | import org.apache.spark.sql.connector.write.LogicalWriteInfo
22 | import org.apache.spark.sql.connector.write.WriteBuilder
23 | import org.apache.spark.sql.execution.datasources.FileFormat
24 | import org.apache.spark.sql.execution.datasources.v2.FileTable
25 | import org.apache.spark.sql.types.DataType
26 | import org.apache.spark.sql.types.StructType
27 | import org.apache.spark.sql.util.CaseInsensitiveStringMap
28 | import org.apache.spark.sql.connector.catalog.TableCapability
29 | import org.apache.spark.sql.connector.catalog.TableCapability._
30 | import scala.jdk.CollectionConverters._
31 |
32 | case class ExcelTable(
33 | name: String,
34 | sparkSession: SparkSession,
35 | map: CaseInsensitiveStringMap,
36 | paths: Seq[String],
37 | userSpecifiedSchema: Option[StructType]
38 | ) extends FileTable(sparkSession, map, paths, userSpecifiedSchema) {
39 |
40 | override def newScanBuilder(params: CaseInsensitiveStringMap): ExcelScanBuilder =
41 | ExcelScanBuilder(sparkSession, fileIndex, schema, dataSchema, params)
42 |
43 | override def inferSchema(files: Seq[FileStatus]): Option[StructType] = {
44 | val options =
45 | new ExcelOptions(map.asScala.toMap, sparkSession.sessionState.conf.sessionLocalTimeZone)
46 |
47 | if (files.nonEmpty) Some(infer(sparkSession, files, options)) else Some(StructType(Seq.empty))
48 | }
49 |
50 | override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder =
51 | new ExcelWriteBuilder(paths, formatName, supportsDataType, info)
52 |
53 | override def supportsDataType(dataType: DataType): Boolean = true
54 |
55 | override def formatName: String = "Excel"
56 |
57 | override def fallbackFileFormat: Class[_ <: FileFormat] =
58 | throw new UnsupportedOperationException("Excel does not support V1 File Format")
59 |
60 | override def capabilities: java.util.Set[TableCapability] =
61 | Set(ACCEPT_ANY_SCHEMA, BATCH_READ, BATCH_WRITE, OVERWRITE_BY_FILTER, TRUNCATE).asJava
62 |
63 |   /* Performs the actual schema inference */
64 | private def infer(sparkSession: SparkSession, inputPaths: Seq[FileStatus], options: ExcelOptions): StructType = {
65 | val excelHelper = ExcelHelper(options)
66 | val conf = sparkSession.sessionState.newHadoopConf()
67 |
68 |     /* Sampling ratio on file level (not row level as in CSV) */
69 | val paths = {
70 | var sample = (inputPaths.size * options.samplingRatio).intValue
71 | sample = if (sample < 1) 1 else sample
72 | inputPaths.take(sample).map(_.getPath.toUri)
73 | }
74 | val (sheetData, colNames) = excelHelper.parseSheetData(conf, paths)
75 | try {
76 | if (sheetData.rowIterator.isEmpty) {
77 |         /* If the first file is empty, don't check further */
78 | StructType(Seq.empty)
79 | } else {
80 | /* Ready to infer schema */
81 | ExcelInferSchema(options).infer(sheetData.rowIterator, colNames)
82 | }
83 | } finally {
84 | sheetData.close()
85 | }
86 | }
87 | }
88 |
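
The file-level sampling in infer above is easy to misread, so here is a small stand-alone sketch of the same arithmetic (hypothetical file counts, no Spark dependency): with N input files and samplingRatio r, the source parses max(1, (N * r).toInt) files.

object SamplingSketch {
  // Mirrors the arithmetic in ExcelTable.infer: never sample fewer than one file.
  def filesToSample(inputFileCount: Int, samplingRatio: Double): Int = {
    val sample = (inputFileCount * samplingRatio).intValue
    if (sample < 1) 1 else sample
  }

  def main(args: Array[String]): Unit = {
    println(filesToSample(10, 0.3)) // 3: three of the ten files are parsed for inference
    println(filesToSample(2, 0.1))  // 1: 0.2 truncates to 0, but at least one file is always read
  }
}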
--------------------------------------------------------------------------------
/src/main/3.0_to_3.1/scala/dev/mauch/spark/excel/v2/ExcelWriteBuilder.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.hadoop.mapreduce.Job
20 | import org.apache.hadoop.mapreduce.TaskAttemptContext
21 | import org.apache.spark.sql.connector.write.LogicalWriteInfo
22 | import org.apache.spark.sql.execution.datasources.OutputWriter
23 | import org.apache.spark.sql.execution.datasources.OutputWriterFactory
24 | import org.apache.spark.sql.execution.datasources.v2.FileWriteBuilder
25 | import org.apache.spark.sql.internal.SQLConf
26 | import org.apache.spark.sql.types.DataType
27 | import org.apache.spark.sql.types.StructType
28 |
29 | class ExcelWriteBuilder(
30 | paths: Seq[String],
31 | formatName: String,
32 | supportsDataType: DataType => Boolean,
33 | info: LogicalWriteInfo
34 | ) extends FileWriteBuilder(paths, formatName, supportsDataType, info) {
35 | override def prepareWrite(
36 | sqlConf: SQLConf,
37 | job: Job,
38 | options: Map[String, String],
39 | dataSchema: StructType
40 | ): OutputWriterFactory = {
41 |
42 | val excelOptions = new ExcelOptions(options, sqlConf.sessionLocalTimeZone)
43 |
44 | new OutputWriterFactory {
45 | override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = {
46 | new ExcelOutputWriter(path, dataSchema, context, excelOptions)
47 | }
48 |
49 | override def getFileExtension(context: TaskAttemptContext): String =
50 | s".${excelOptions.fileExtension}"
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/3.0_to_3.2/scala/dev/mauch/spark/excel/v2/ExcelScan.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.hadoop.fs.Path
20 | import org.apache.spark.sql.SparkSession
21 | import org.apache.spark.sql.catalyst.expressions.{ExprUtils, Expression}
22 | import org.apache.spark.sql.connector.read.PartitionReaderFactory
23 | import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
24 | import org.apache.spark.sql.execution.datasources.v2.{FileScan, TextBasedFileScan}
25 | import org.apache.spark.sql.sources.Filter
26 | import org.apache.spark.sql.types.StructType
27 | import org.apache.spark.sql.util.CaseInsensitiveStringMap
28 | import org.apache.spark.util.SerializableConfiguration
29 |
30 | import scala.collection.compat.immutable.ArraySeq
31 | import scala.jdk.CollectionConverters._
32 |
33 | case class ExcelScan(
34 | sparkSession: SparkSession,
35 | fileIndex: PartitioningAwareFileIndex,
36 | dataSchema: StructType,
37 | readDataSchema: StructType,
38 | readPartitionSchema: StructType,
39 | options: CaseInsensitiveStringMap,
40 | pushedFilters: Array[Filter],
41 | partitionFilters: Seq[Expression] = Seq.empty,
42 | dataFilters: Seq[Expression] = Seq.empty
43 | ) extends TextBasedFileScan(sparkSession, options) {
44 |
45 | private lazy val parsedOptions: ExcelOptions = new ExcelOptions(
46 | options.asScala.toMap,
47 | sparkSession.sessionState.conf.sessionLocalTimeZone,
48 | sparkSession.sessionState.conf.columnNameOfCorruptRecord
49 | )
50 |
51 | override def isSplitable(path: Path): Boolean = false
52 |
53 | override def getFileUnSplittableReason(path: Path): String = {
54 |     "No practical method of splitting an Excel file"
55 | }
56 |
57 | override def createReaderFactory(): PartitionReaderFactory = {
58 |
59 |     /* Check the field requirement for corrupt records here so that any exception is thrown on the driver side
60 | */
61 | ExprUtils.verifyColumnNameOfCorruptRecord(dataSchema, parsedOptions.columnNameOfCorruptRecord)
62 |
63 | if (
64 | readDataSchema.length == 1 &&
65 | readDataSchema.head.name == parsedOptions.columnNameOfCorruptRecord
66 | ) {
67 | throw new RuntimeException(
68 | "Queries from raw Excel files are disallowed when the referenced " +
69 | "columns only include the internal corrupt record column"
70 | )
71 | }
72 |
73 | val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap
74 |
75 | /* Hadoop Configurations are case sensitive. */
76 | val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
77 |
78 | val broadcastedConf = sparkSession.sparkContext
79 | .broadcast(new SerializableConfiguration(hadoopConf))
80 |
81 | /* The partition values are already truncated in `FileScan.partitions`. We should use `readPartitionSchema` as the
82 | * partition schema here.
83 | */
84 | ExcelPartitionReaderFactory(
85 | sparkSession.sessionState.conf,
86 | broadcastedConf,
87 | dataSchema,
88 | readDataSchema,
89 | readPartitionSchema,
90 | parsedOptions,
91 | ArraySeq.unsafeWrapArray(pushedFilters)
92 | )
93 | }
94 |
95 | override def withFilters(partitionFilters: Seq[Expression], dataFilters: Seq[Expression]): FileScan =
96 | this.copy(partitionFilters = partitionFilters, dataFilters = dataFilters)
97 |
98 | override def equals(obj: Any): Boolean = obj match {
99 | case c: ExcelScan =>
100 | super.equals(c) && dataSchema == c.dataSchema && options == c.options &&
101 | equivalentFilters(pushedFilters, c.pushedFilters)
102 | case _ => false
103 | }
104 |
105 | override def hashCode(): Int = super.hashCode()
106 |
107 | override def description(): String = {
108 | super.description() + ", PushedFilters: " + pushedFilters.mkString("[", ", ", "]")
109 | }
110 | }
111 |
--------------------------------------------------------------------------------
/src/main/3.0_to_3.2/scala/dev/mauch/spark/excel/v2/ExcelScanBuilder.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.sql.SparkSession
20 | import org.apache.spark.sql.connector.read.{Scan, SupportsPushDownFilters}
21 | import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
22 | import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder
23 | import org.apache.spark.sql.sources.Filter
24 | import org.apache.spark.sql.types.StructType
25 | import org.apache.spark.sql.util.CaseInsensitiveStringMap
26 |
27 | case class ExcelScanBuilder(
28 | sparkSession: SparkSession,
29 | fileIndex: PartitioningAwareFileIndex,
30 | schema: StructType,
31 | dataSchema: StructType,
32 | options: CaseInsensitiveStringMap
33 | ) extends FileScanBuilder(sparkSession, fileIndex, dataSchema)
34 | with SupportsPushDownFilters {
35 |
36 | override def build(): Scan = {
37 | ExcelScan(sparkSession, fileIndex, dataSchema, readDataSchema(), readPartitionSchema(), options, pushedFilters())
38 | }
39 |
40 | private var _pushedFilters: Array[Filter] = Array.empty
41 |
42 | override def pushFilters(filters: Array[Filter]): Array[Filter] = {
43 | _pushedFilters = ExcelFilters.pushedFilters(filters, dataSchema)
44 | filters
45 | }
46 |
47 | override def pushedFilters(): Array[Filter] = _pushedFilters
48 | }
49 |
--------------------------------------------------------------------------------
/src/main/3.0_to_3.3/scala/dev/mauch/spark/excel/v2/ExcelOptions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
20 | import org.apache.spark.sql.internal.SQLConf
21 |
22 | class ExcelOptions(
23 | @transient
24 | val parameters: CaseInsensitiveMap[String],
25 | val defaultTimeZoneId: String,
26 | val defaultColumnNameOfCorruptRecord: String
27 | ) extends ExcelOptionsTrait
28 | with Serializable {
29 | // all parameter handling is implemented in ExcelOptionsTrait
30 |
31 | def this(parameters: Map[String, String], defaultTimeZoneId: String) = {
32 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, SQLConf.get.columnNameOfCorruptRecord)
33 | }
34 |
35 | def this(parameters: Map[String, String], defaultTimeZoneId: String, defaultColumnNameOfCorruptRecord: String) = {
36 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, defaultColumnNameOfCorruptRecord)
37 | }
38 |
39 | }
40 |
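
A construction sketch (assuming ExcelOptionsTrait accepts the "header" key and supplies defaults for the rest): the auxiliary constructors above take a plain Map plus a default time zone, with the corrupt-record column name coming either from SQLConf or from the caller.

import dev.mauch.spark.excel.v2.ExcelOptions

object ExcelOptionsSketch {
  def main(args: Array[String]): Unit = {
    // Two-argument constructor: the corrupt-record column name falls back to SQLConf.
    val opts = new ExcelOptions(Map("header" -> "true"), "UTC")

    // Three-argument constructor: name the corrupt-record column explicitly.
    val explicit = new ExcelOptions(Map("header" -> "true"), "UTC", "_corrupt_record")

    println(opts.defaultTimeZoneId)                    // UTC
    println(explicit.defaultColumnNameOfCorruptRecord) // _corrupt_record
  }
}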
--------------------------------------------------------------------------------
/src/main/3.0_to_3.4.1/scala/dev/mauch/spark/excel/v2/ExcelParserBase.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import _root_.org.apache.spark.sql.catalyst.util.BadRecordException
20 | import org.apache.spark.unsafe.types.UTF8String
21 | import org.apache.spark.sql.catalyst.InternalRow
22 |
23 | trait ExcelParserBase {
24 |
25 | protected def getCurrentInput: UTF8String
26 | def badRecord(partialResults: Array[InternalRow], baseException: Throwable): BadRecordException =
27 | BadRecordException(() => getCurrentInput, () => partialResults.headOption, baseException)
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/3.0_to_3.4.1/scala/dev/mauch/spark/excel/v2/ExcelPartitionReaderFactory.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.hadoop.conf.Configuration
20 | import org.apache.spark.broadcast.Broadcast
21 | import org.apache.spark.sql.catalyst.InternalRow
22 | import org.apache.spark.sql.connector.read.PartitionReader
23 | import org.apache.spark.sql.execution.datasources.PartitionedFile
24 | import org.apache.spark.sql.execution.datasources.v2._
25 | import org.apache.spark.sql.internal.SQLConf
26 | import org.apache.spark.sql.sources.Filter
27 | import org.apache.spark.sql.types.StructType
28 | import org.apache.spark.util.SerializableConfiguration
29 |
30 | import java.net.URI
31 | import scala.util.control.NonFatal
32 |
33 | /** A factory used to create Excel readers.
34 | *
35 | * @param sqlConf
36 | * SQL configuration.
37 | * @param broadcastedConf
38 | * Broadcasted serializable Hadoop Configuration.
39 | * @param dataSchema
40 | * Schema of Excel files.
41 | * @param readDataSchema
42 | * Required data schema in the batch scan.
43 | * @param partitionSchema
44 | * Schema of partitions.
45 | * @param options
46 | * Options for parsing Excel files.
47 | */
48 | case class ExcelPartitionReaderFactory(
49 | sqlConf: SQLConf,
50 | broadcastedConf: Broadcast[SerializableConfiguration],
51 | dataSchema: StructType,
52 | readDataSchema: StructType,
53 | partitionSchema: StructType,
54 | options: ExcelOptions,
55 | filters: Seq[Filter]
56 | ) extends FilePartitionReaderFactory {
57 |
58 | override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = {
59 | val conf = broadcastedConf.value.value
60 | val actualDataSchema =
61 | StructType(dataSchema.filterNot(_.name == options.columnNameOfCorruptRecord))
62 | val actualReadDataSchema =
63 | StructType(readDataSchema.filterNot(_.name == options.columnNameOfCorruptRecord))
64 | val parser = new ExcelParser(actualDataSchema, actualReadDataSchema, options, filters)
65 | val headerChecker =
66 | new ExcelHeaderChecker(actualReadDataSchema, options, source = s"Excel file: ${file.filePath}")
67 | val iter = readFile(conf, file, parser, headerChecker, readDataSchema)
68 | val partitionReader = new SparkExcelPartitionReaderFromIterator(iter)
69 | new PartitionReaderWithPartitionValues(partitionReader, readDataSchema, partitionSchema, file.partitionValues)
70 | }
71 |
72 | private def readFile(
73 | conf: Configuration,
74 | file: PartitionedFile,
75 | parser: ExcelParser,
76 | headerChecker: ExcelHeaderChecker,
77 | requiredSchema: StructType
78 | ): SheetData[InternalRow] = {
79 | val excelHelper = ExcelHelper(options)
80 | val sheetData = excelHelper.getSheetData(conf, URI.create(file.filePath.toString))
81 | try {
82 | SheetData(
83 | ExcelParser.parseIterator(sheetData.rowIterator, parser, headerChecker, requiredSchema),
84 | sheetData.resourcesToClose
85 | )
86 | } catch {
87 | case NonFatal(t) => {
88 | sheetData.close()
89 | throw t
90 | }
91 | }
92 | }
93 |
94 | }
95 |
96 | private class SparkExcelPartitionReaderFromIterator(sheetData: SheetData[InternalRow])
97 | extends PartitionReaderFromIterator[InternalRow](sheetData.rowIterator) {
98 | override def close(): Unit = {
99 | super.close()
100 | sheetData.close()
101 | }
102 | }
103 |
--------------------------------------------------------------------------------
/src/main/3.1/scala/dev/mauch/spark/excel/v2/ExcelDateTimeStringUtils.scala:
--------------------------------------------------------------------------------
1 | /** Copyright 2016 - 2021 Martin Mauch (@nightscape)
2 | *
3 | * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
4 | * the License. You may obtain a copy of the License at
5 | *
6 | * http://www.apache.org/licenses/LICENSE-2.0
7 | *
8 | * Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
9 | * an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
10 | * specific language governing permissions and limitations under the License.
11 | */
12 | package dev.mauch.spark.excel.v2
13 |
14 | import org.apache.spark.unsafe.types.UTF8String
15 | import org.apache.spark.sql.catalyst.util._
16 | import java.time.ZoneId
17 | import org.apache.spark.sql.catalyst.util.TimestampFormatter
18 | import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
19 |
20 | object ExcelDateTimeStringUtils {
21 | def stringToTimestamp(v: String, zoneId: ZoneId): Option[Long] = {
22 | val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(v))
23 | DateTimeUtils.stringToTimestamp(str, zoneId)
24 | }
25 |
26 | def stringToDate(v: String, zoneId: ZoneId): Option[Int] = {
27 | val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(v))
28 | DateTimeUtils.stringToDate(str, zoneId)
29 | }
30 |
31 | def getTimestampFormatter(options: ExcelOptions): TimestampFormatter = TimestampFormatter(
32 | options.timestampFormat,
33 | options.zoneId,
34 | options.locale,
35 | legacyFormat = FAST_DATE_FORMAT,
36 | isParsing = true
37 | )
38 |
39 | def getDateFormatter(options: ExcelOptions): DateFormatter =
40 | DateFormatter(options.dateFormat, options.zoneId, options.locale, legacyFormat = FAST_DATE_FORMAT, isParsing = true)
41 |
42 | }
43 |
--------------------------------------------------------------------------------
/src/main/3.1_and_up/scala/dev/mauch/spark/excel/v2/ExcelFilters.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.sql.catalyst.OrderedFilters
20 | import org.apache.spark.sql.catalyst.StructFilters
21 | import org.apache.spark.sql.sources
22 | import org.apache.spark.sql.types.StructType
23 |
24 | /** Wraps the API difference between Spark 3.0 and 3.1 */
25 | class ExcelFilters(filters: Seq[sources.Filter], requiredSchema: StructType)
26 | extends OrderedFilters(filters, requiredSchema) {}
27 |
28 | object ExcelFilters {
29 | def pushedFilters(filters: Array[sources.Filter], schema: StructType): Array[sources.Filter] =
30 | StructFilters.pushedFilters(filters, schema)
31 | }
32 |
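
A pushdown sketch (hypothetical schema and filters): pushedFilters keeps only the filters whose referenced columns are all present in the given schema, delegating to StructFilters on Spark 3.1 and later.

import dev.mauch.spark.excel.v2.ExcelFilters
import org.apache.spark.sql.sources
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

object FilterPushdownSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(
      StructField("name", StringType),
      StructField("age", IntegerType)
    ))

    val filters: Array[sources.Filter] =
      Array(sources.GreaterThan("age", 21), sources.IsNotNull("name"))

    // Both filters reference columns in the schema, so both are reported as pushed.
    ExcelFilters.pushedFilters(filters, schema).foreach(println)
  }
}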
--------------------------------------------------------------------------------
/src/main/3.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelDateTimeStringUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.unsafe.types.UTF8String
20 | import org.apache.spark.sql.catalyst.util._
21 |
22 | import java.time.ZoneId
23 | import org.apache.spark.sql.catalyst.util.DateFormatter
24 | import org.apache.spark.sql.catalyst.util.TimestampFormatter
25 | import org.apache.spark.sql.catalyst.util.LegacyDateFormats.FAST_DATE_FORMAT
26 |
27 | import scala.annotation.nowarn
28 |
29 | object ExcelDateTimeStringUtils {
30 | def stringToTimestamp(v: String, zoneId: ZoneId): Option[Long] = {
31 | val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(v))
32 | DateTimeUtils.stringToTimestamp(str, zoneId)
33 | }
34 |
35 | @nowarn
36 | def stringToDate(v: String, zoneId: ZoneId): Option[Int] = {
37 | val str = DateTimeUtils.cleanLegacyTimestampStr(UTF8String.fromString(v))
38 | DateTimeUtils.stringToDate(str)
39 | }
40 |
41 | def getTimestampFormatter(options: ExcelOptions): TimestampFormatter = TimestampFormatter(
42 | options.timestampFormat,
43 | options.zoneId,
44 | options.locale,
45 | legacyFormat = FAST_DATE_FORMAT,
46 | isParsing = true
47 | )
48 |
49 | def getDateFormatter(options: ExcelOptions): DateFormatter =
50 | DateFormatter(options.dateFormat, options.locale, legacyFormat = FAST_DATE_FORMAT, isParsing = true)
51 |
52 | }
53 |
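
A usage sketch (hypothetical input strings): these helpers return Spark's internal representations, i.e. microseconds since the epoch for timestamps and days since 1970-01-01 for dates, wrapped in Option.

import java.time.ZoneId
import dev.mauch.spark.excel.v2.ExcelDateTimeStringUtils

object DateTimeSketch {
  def main(args: Array[String]): Unit = {
    val utc = ZoneId.of("UTC")

    // Some(microseconds since the epoch), or None if the string cannot be parsed.
    println(ExcelDateTimeStringUtils.stringToTimestamp("2021-06-01 12:34:56", utc))

    // Some(days since 1970-01-01), or None if the string cannot be parsed.
    println(ExcelDateTimeStringUtils.stringToDate("2021-06-01", utc))
  }
}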
--------------------------------------------------------------------------------
/src/main/3.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelOutputWriter.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.hadoop.mapreduce.TaskAttemptContext
20 | import org.apache.spark.internal.Logging
21 | import org.apache.spark.sql.catalyst.InternalRow
22 | import org.apache.spark.sql.execution.datasources.OutputWriter
23 | import org.apache.spark.sql.types.StructType
24 |
25 | class ExcelOutputWriter(val path: String, dataSchema: StructType, context: TaskAttemptContext, options: ExcelOptions)
26 | extends OutputWriter
27 | with Logging {
28 |
29 | private val gen = new ExcelGenerator(path, dataSchema, context.getConfiguration, options)
30 | if (options.header) { gen.writeHeaders() }
31 |
32 | override def write(row: InternalRow): Unit = gen.write(row)
33 |
34 | override def close(): Unit = gen.close()
35 | }
36 |
--------------------------------------------------------------------------------
/src/main/3.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelTable.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.hadoop.fs.FileStatus
20 | import org.apache.spark.sql.SparkSession
21 | import org.apache.spark.sql.connector.write.Write
22 | import org.apache.spark.sql.connector.write.LogicalWriteInfo
23 | import org.apache.spark.sql.connector.write.WriteBuilder
24 | import org.apache.spark.sql.execution.datasources.FileFormat
25 | import org.apache.spark.sql.execution.datasources.v2.FileTable
26 | import org.apache.spark.sql.types.DataType
27 | import org.apache.spark.sql.types.StructType
28 | import org.apache.spark.sql.util.CaseInsensitiveStringMap
29 | import org.apache.spark.sql.connector.catalog.TableCapability
30 | import org.apache.spark.sql.connector.catalog.TableCapability._
31 | import scala.jdk.CollectionConverters._
32 |
33 | case class ExcelTable(
34 | name: String,
35 | sparkSession: SparkSession,
36 | map: CaseInsensitiveStringMap,
37 | paths: Seq[String],
38 | userSpecifiedSchema: Option[StructType]
39 | ) extends FileTable(sparkSession, map, paths, userSpecifiedSchema) {
40 |
41 | override def newScanBuilder(params: CaseInsensitiveStringMap): ExcelScanBuilder =
42 | ExcelScanBuilder(sparkSession, fileIndex, schema, dataSchema, params)
43 |
44 | override def inferSchema(files: Seq[FileStatus]): Option[StructType] = {
45 | val options =
46 | new ExcelOptions(map.asScala.toMap, sparkSession.sessionState.conf.sessionLocalTimeZone)
47 |
48 | if (files.nonEmpty) Some(infer(sparkSession, files, options)) else Some(StructType(Seq.empty))
49 | }
50 |
51 | override def newWriteBuilder(info: LogicalWriteInfo): WriteBuilder =
52 | new WriteBuilder {
53 | override def build(): Write = ExcelWriteBuilder(paths, formatName, supportsDataType, info)
54 | }
55 |
56 | override def supportsDataType(dataType: DataType): Boolean = true
57 |
58 | override def formatName: String = "Excel"
59 |
60 | override def fallbackFileFormat: Class[_ <: FileFormat] =
61 | throw new UnsupportedOperationException("Excel does not support V1 File Format")
62 |
63 | override def capabilities: java.util.Set[TableCapability] =
64 | Set(ACCEPT_ANY_SCHEMA, BATCH_READ, BATCH_WRITE, OVERWRITE_BY_FILTER, TRUNCATE).asJava
65 |
66 |   /* Performs the actual schema inference */
67 | private def infer(sparkSession: SparkSession, inputPaths: Seq[FileStatus], options: ExcelOptions): StructType = {
68 | val excelHelper = ExcelHelper(options)
69 | val conf = sparkSession.sessionState.newHadoopConf()
70 |
71 | /* Sampling ratio on file level (not row level as in CSV) */
72 | val paths = {
73 | var sample = (inputPaths.size * options.samplingRatio).intValue
74 | sample = if (sample < 1) 1 else sample
75 | inputPaths.take(sample).map(_.getPath.toUri)
76 | }
77 | val (sheetData, colNames) = excelHelper.parseSheetData(conf, paths)
78 | try {
79 | if (sheetData.rowIterator.isEmpty) {
80 |         /* If the first file is empty, don't check further */
81 | StructType(Seq.empty)
82 | } else {
83 | /* Ready to infer schema */
84 | ExcelInferSchema(options).infer(sheetData.rowIterator, colNames)
85 | }
86 | } finally {
87 | sheetData.close()
88 | }
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/src/main/3.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelWriteBuilder.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.hadoop.mapreduce.Job
20 | import org.apache.hadoop.mapreduce.TaskAttemptContext
21 | import org.apache.spark.sql.connector.write.LogicalWriteInfo
22 | import org.apache.spark.sql.execution.datasources.OutputWriter
23 | import org.apache.spark.sql.execution.datasources.OutputWriterFactory
24 | import org.apache.spark.sql.execution.datasources.v2.FileWrite
25 | import org.apache.spark.sql.internal.SQLConf
26 | import org.apache.spark.sql.types.DataType
27 | import org.apache.spark.sql.types.StructType
28 |
29 | case class ExcelWriteBuilder(
30 | paths: Seq[String],
31 | formatName: String,
32 | supportsDataType: DataType => Boolean,
33 | info: LogicalWriteInfo
34 | ) extends FileWrite {
35 | override def prepareWrite(
36 | sqlConf: SQLConf,
37 | job: Job,
38 | options: Map[String, String],
39 | dataSchema: StructType
40 | ): OutputWriterFactory = {
41 |
42 | val excelOptions = new ExcelOptions(options, sqlConf.sessionLocalTimeZone)
43 |
44 | new OutputWriterFactory {
45 | override def newInstance(path: String, dataSchema: StructType, context: TaskAttemptContext): OutputWriter = {
46 | new ExcelOutputWriter(path, dataSchema, context, excelOptions)
47 | }
48 |
49 | override def getFileExtension(context: TaskAttemptContext): String =
50 | s".${excelOptions.fileExtension}"
51 | }
52 | }
53 | }
54 |
--------------------------------------------------------------------------------
/src/main/3.3_and_up/scala/dev/mauch/spark/excel/v2/ExcelScan.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.hadoop.fs.Path
20 | import org.apache.spark.sql.SparkSession
21 | import org.apache.spark.sql.catalyst.expressions.{ExprUtils, Expression}
22 | import org.apache.spark.sql.connector.read.PartitionReaderFactory
23 | import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
24 | import org.apache.spark.sql.execution.datasources.v2.TextBasedFileScan
25 | import org.apache.spark.sql.sources.Filter
26 | import org.apache.spark.sql.types.StructType
27 | import org.apache.spark.sql.util.CaseInsensitiveStringMap
28 | import org.apache.spark.util.SerializableConfiguration
29 |
30 | import scala.collection.compat.immutable.ArraySeq
31 | import scala.jdk.CollectionConverters._
32 |
33 | case class ExcelScan(
34 | sparkSession: SparkSession,
35 | fileIndex: PartitioningAwareFileIndex,
36 | dataSchema: StructType,
37 | readDataSchema: StructType,
38 | readPartitionSchema: StructType,
39 | options: CaseInsensitiveStringMap,
40 | pushedFilters: Array[Filter],
41 | partitionFilters: Seq[Expression] = Seq.empty,
42 | dataFilters: Seq[Expression] = Seq.empty
43 | ) extends TextBasedFileScan(sparkSession, options) {
44 |
45 | private lazy val parsedOptions: ExcelOptions = new ExcelOptions(
46 | options.asScala.toMap,
47 | sparkSession.sessionState.conf.sessionLocalTimeZone,
48 | sparkSession.sessionState.conf.columnNameOfCorruptRecord
49 | )
50 |
51 | override def isSplitable(path: Path): Boolean = false
52 |
53 | override def getFileUnSplittableReason(path: Path): String = {
54 |     "No practical method of splitting an Excel file"
55 | }
56 |
57 | override def createReaderFactory(): PartitionReaderFactory = {
58 |
59 |     /* Check the field requirement for corrupt records here so that any exception is thrown on the driver side
60 | */
61 | ExprUtils.verifyColumnNameOfCorruptRecord(dataSchema, parsedOptions.columnNameOfCorruptRecord)
62 |
63 | if (
64 | readDataSchema.length == 1 &&
65 | readDataSchema.head.name == parsedOptions.columnNameOfCorruptRecord
66 | ) {
67 | throw new RuntimeException(
68 | "Queries from raw Excel files are disallowed when the referenced " +
69 | "columns only include the internal corrupt record column"
70 | )
71 | }
72 |
73 | val caseSensitiveMap = options.asCaseSensitiveMap.asScala.toMap
74 |
75 | /* Hadoop Configurations are case sensitive. */
76 | val hadoopConf = sparkSession.sessionState.newHadoopConfWithOptions(caseSensitiveMap)
77 |
78 | val broadcastedConf = sparkSession.sparkContext
79 | .broadcast(new SerializableConfiguration(hadoopConf))
80 |
81 | /* The partition values are already truncated in `FileScan.partitions`. We should use `readPartitionSchema` as the
82 | * partition schema here.
83 | */
84 | ExcelPartitionReaderFactory(
85 | sparkSession.sessionState.conf,
86 | broadcastedConf,
87 | dataSchema,
88 | readDataSchema,
89 | readPartitionSchema,
90 | parsedOptions,
91 | ArraySeq.unsafeWrapArray(pushedFilters)
92 | )
93 | }
94 |
95 | override def equals(obj: Any): Boolean = obj match {
96 | case c: ExcelScan =>
97 | super.equals(c) && dataSchema == c.dataSchema && options == c.options &&
98 | equivalentFilters(pushedFilters, c.pushedFilters)
99 | case _ => false
100 | }
101 |
102 | override def hashCode(): Int = super.hashCode()
103 |
104 | override def description(): String = {
105 | super.description() + ", PushedFilters: " + pushedFilters.mkString("[", ", ", "]")
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/src/main/3.3_and_up/scala/dev/mauch/spark/excel/v2/ExcelScanBuilder.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.sql.SparkSession
20 | import org.apache.spark.sql.connector.read.Scan
21 | import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex
22 | import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder
23 | import org.apache.spark.sql.internal.connector.SupportsPushDownCatalystFilters
24 | import org.apache.spark.sql.types.StructType
25 | import org.apache.spark.sql.util.CaseInsensitiveStringMap
26 |
27 | case class ExcelScanBuilder(
28 | sparkSession: SparkSession,
29 | fileIndex: PartitioningAwareFileIndex,
30 | schema: StructType,
31 | dataSchema: StructType,
32 | options: CaseInsensitiveStringMap
33 | ) extends FileScanBuilder(sparkSession, fileIndex, dataSchema)
34 | with SupportsPushDownCatalystFilters {
35 |
36 | override def build(): Scan = {
37 | ExcelScan(sparkSession, fileIndex, dataSchema, readDataSchema(), readPartitionSchema(), options, pushedDataFilters)
38 | }
39 |
40 | }
41 |
--------------------------------------------------------------------------------
/src/main/3.4.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelParserBase.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import _root_.org.apache.spark.sql.catalyst.util.BadRecordException
20 | import org.apache.spark.unsafe.types.UTF8String
21 | import org.apache.spark.sql.catalyst.InternalRow
22 |
23 | trait ExcelParserBase {
24 |
25 | protected def getCurrentInput: UTF8String
26 | def badRecord(partialResults: Array[InternalRow], baseException: Throwable): BadRecordException =
27 | BadRecordException(() => getCurrentInput, () => partialResults, baseException)
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/3.4.2_and_up/scala/dev/mauch/spark/excel/v2/ExcelPartitionReaderFactory.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.hadoop.conf.Configuration
20 | import org.apache.spark.broadcast.Broadcast
21 | import org.apache.spark.sql.catalyst.{FileSourceOptions, InternalRow}
22 | import org.apache.spark.sql.connector.read.PartitionReader
23 | import org.apache.spark.sql.execution.datasources.PartitionedFile
24 | import org.apache.spark.sql.execution.datasources.v2._
25 | import org.apache.spark.sql.internal.SQLConf
26 | import org.apache.spark.sql.sources.Filter
27 | import org.apache.spark.sql.types.StructType
28 | import org.apache.spark.util.SerializableConfiguration
29 |
30 | import java.net.URI
31 | import scala.util.control.NonFatal
32 |
33 | /** A factory used to create Excel readers.
34 | *
35 | * @param sqlConf
36 | * SQL configuration.
37 | * @param broadcastedConf
38 | * Broadcasted serializable Hadoop Configuration.
39 | * @param dataSchema
40 | * Schema of Excel files.
41 | * @param readDataSchema
42 | * Required data schema in the batch scan.
43 | * @param partitionSchema
44 | * Schema of partitions.
45 | * @param parsedOptions
46 | * Options for parsing Excel files.
47 | */
48 | case class ExcelPartitionReaderFactory(
49 | sqlConf: SQLConf,
50 | broadcastedConf: Broadcast[SerializableConfiguration],
51 | dataSchema: StructType,
52 | readDataSchema: StructType,
53 | partitionSchema: StructType,
54 | parsedOptions: ExcelOptions,
55 | filters: Seq[Filter]
56 | ) extends FilePartitionReaderFactory {
57 | protected def options: FileSourceOptions = new FileSourceOptions(
58 | Map(FileSourceOptions.IGNORE_CORRUPT_FILES -> "true", FileSourceOptions.IGNORE_MISSING_FILES -> "true")
59 | )
60 | override def buildReader(file: PartitionedFile): PartitionReader[InternalRow] = {
61 | val conf = broadcastedConf.value.value
62 | val actualDataSchema =
63 | StructType(dataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))
64 | val actualReadDataSchema =
65 | StructType(readDataSchema.filterNot(_.name == parsedOptions.columnNameOfCorruptRecord))
66 | val parser = new ExcelParser(actualDataSchema, actualReadDataSchema, parsedOptions, filters)
67 | val headerChecker =
68 | new ExcelHeaderChecker(actualReadDataSchema, parsedOptions, source = s"Excel file: ${file.filePath}")
69 | val iter = readFile(conf, file, parser, headerChecker, readDataSchema)
70 | val partitionReader = new SparkExcelPartitionReaderFromIterator(iter)
71 | new PartitionReaderWithPartitionValues(partitionReader, readDataSchema, partitionSchema, file.partitionValues)
72 | }
73 |
74 | private def readFile(
75 | conf: Configuration,
76 | file: PartitionedFile,
77 | parser: ExcelParser,
78 | headerChecker: ExcelHeaderChecker,
79 | requiredSchema: StructType
80 | ): SheetData[InternalRow] = {
81 | val excelHelper = ExcelHelper(parsedOptions)
82 | val sheetData = excelHelper.getSheetData(conf, URI.create(file.filePath.toString))
83 | try {
84 | SheetData(
85 | ExcelParser.parseIterator(sheetData.rowIterator, parser, headerChecker, requiredSchema),
86 | sheetData.resourcesToClose
87 | )
88 | } catch {
89 | case NonFatal(t) => {
90 | sheetData.close()
91 | throw t
92 | }
93 | }
94 | }
95 |
96 | }
97 |
98 | private class SparkExcelPartitionReaderFromIterator(sheetData: SheetData[InternalRow])
99 | extends PartitionReaderFromIterator[InternalRow](sheetData.rowIterator) {
100 | override def close(): Unit = {
101 | super.close()
102 | sheetData.close()
103 | }
104 | }
105 |
--------------------------------------------------------------------------------
/src/main/3.4_and_up/scala/dev/mauch/spark/excel/v2/ExcelOptions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.sql.catalyst.FileSourceOptions
20 | import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
21 | import org.apache.spark.sql.internal.SQLConf
22 |
23 | class ExcelOptions(
24 | @transient
25 | val parameters: CaseInsensitiveMap[String],
26 | val defaultTimeZoneId: String,
27 | val defaultColumnNameOfCorruptRecord: String
28 | ) extends FileSourceOptions(parameters)
29 | with ExcelOptionsTrait {
30 |
31 | def this(parameters: Map[String, String], defaultTimeZoneId: String) = {
32 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, SQLConf.get.columnNameOfCorruptRecord)
33 | }
34 |
35 | def this(parameters: Map[String, String], defaultTimeZoneId: String, defaultColumnNameOfCorruptRecord: String) = {
36 | this(CaseInsensitiveMap(parameters), defaultTimeZoneId, defaultColumnNameOfCorruptRecord)
37 | }
38 |
39 | }
40 |
--------------------------------------------------------------------------------
/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister:
--------------------------------------------------------------------------------
1 | dev.mauch.spark.excel.v2.ExcelDataSource
2 |
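
This service entry is what lets Spark map format("excel") to dev.mauch.spark.excel.v2.ExcelDataSource: the class is discovered as a DataSourceRegister through java.util.ServiceLoader. A minimal stand-alone sketch of that lookup (illustration only, not project code):

import java.util.ServiceLoader
import org.apache.spark.sql.sources.DataSourceRegister
import scala.jdk.CollectionConverters._

object RegistrationSketch {
  def main(args: Array[String]): Unit = {
    // ServiceLoader discovers every DataSourceRegister listed under META-INF/services,
    // including the ExcelDataSource entry registered in the file above.
    val registered = ServiceLoader.load(classOf[DataSourceRegister]).asScala
    registered.foreach(r => println(s"${r.shortName()} -> ${r.getClass.getName}"))
  }
}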
--------------------------------------------------------------------------------
/src/main/scala/dev/mauch/spark/excel/DefaultSource.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel
18 |
19 | import org.apache.hadoop.fs.Path
20 | import org.apache.spark.sql.sources._
21 | import org.apache.spark.sql.types.StructType
22 | import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
23 |
24 | class DefaultSource extends RelationProvider with SchemaRelationProvider with CreatableRelationProvider {
25 |
26 | /** Creates a new relation for retrieving data from an Excel file
27 | */
28 | override def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): ExcelRelation =
29 | createRelation(sqlContext, parameters, null)
30 |
31 | /** Creates a new relation for retrieving data from an Excel file
32 | */
33 | override def createRelation(
34 | sqlContext: SQLContext,
35 | parameters: Map[String, String],
36 | schema: StructType
37 | ): ExcelRelation = {
38 | val conf = sqlContext.sparkSession.sessionState.newHadoopConf()
39 | val wbReader = WorkbookReader(parameters, conf)
40 | val dataLocator = DataLocator(parameters)
41 | ExcelRelation(
42 | header = checkParameter(parameters, "header").toBoolean,
43 | treatEmptyValuesAsNulls = parameters.get("treatEmptyValuesAsNulls").fold(false)(_.toBoolean),
44 | setErrorCellsToFallbackValues = parameters.get("setErrorCellsToFallbackValues").fold(false)(_.toBoolean),
45 | usePlainNumberFormat = parameters.get("usePlainNumberFormat").fold(false)(_.toBoolean),
46 | userSchema = Option(schema),
47 | inferSheetSchema = parameters.get("inferSchema").fold(false)(_.toBoolean),
48 | addColorColumns = parameters.get("addColorColumns").fold(false)(_.toBoolean),
49 | timestampFormat = parameters.get("timestampFormat"),
50 | dateFormat = parameters.get("dateFormat"),
51 | excerptSize = parameters.get("excerptSize").fold(10)(_.toInt),
52 | dataLocator = dataLocator,
53 | workbookReader = wbReader
54 | )(sqlContext)
55 | }
56 |
57 | override def createRelation(
58 | sqlContext: SQLContext,
59 | mode: SaveMode,
60 | parameters: Map[String, String],
61 | data: DataFrame
62 | ): BaseRelation = {
63 | val path = checkParameter(parameters, "path")
64 | val header = checkParameter(parameters, "header").toBoolean
65 | val filesystemPath = new Path(path)
66 | val conf = sqlContext.sparkSession.sessionState.newHadoopConf()
67 | val fs = filesystemPath.getFileSystem(conf)
68 | new ExcelFileSaver(
69 | fs,
70 | filesystemPath,
71 | data,
72 | saveMode = mode,
73 | header = header,
74 | dataLocator = DataLocator(parameters)
75 | ).save()
76 |
77 | createRelation(sqlContext, parameters, data.schema)
78 | }
79 |
80 |   // Forces a parameter to exist; otherwise an exception is thrown.
81 | private def checkParameter(map: Map[String, String], param: String): String = {
82 | if (!map.contains(param)) {
83 | throw new IllegalArgumentException(s"Parameter ${'"'}$param${'"'} is missing in options.")
84 | } else {
85 | map.apply(param)
86 | }
87 | }
88 | }
89 |
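
A usage sketch for the V1 relation above (placeholder path; resolving the format by package name is an assumption based on Spark's usual DefaultSource lookup): header is mandatory, since checkParameter throws an IllegalArgumentException when it is missing, while the other options fall back to the defaults shown in createRelation.

import org.apache.spark.sql.SparkSession

object V1ReadSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").getOrCreate()
    val df = spark.read
      .format("dev.mauch.spark.excel")           // assumed to resolve to DefaultSource in this package
      .option("header", "true")                  // required: checkParameter fails without it
      .option("inferSchema", "true")             // optional, defaults to false
      .option("treatEmptyValuesAsNulls", "true") // optional, defaults to false
      .load("/path/to/workbook.xlsx")            // placeholder path
    df.show()
    spark.stop()
  }
}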
--------------------------------------------------------------------------------
/src/main/scala/dev/mauch/spark/excel/DefaultSource15.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel
18 |
19 | import org.apache.spark.sql.sources.DataSourceRegister
20 |
21 | class DefaultSource15 extends DefaultSource with DataSourceRegister {
22 | override def shortName(): String = "excel"
23 | }
24 |
--------------------------------------------------------------------------------
/src/main/scala/dev/mauch/spark/excel/ExcelFileSaver.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel
18 |
19 | import org.apache.hadoop.fs.{FSDataInputStream, FileSystem, Path}
20 | import org.apache.poi.xssf.usermodel.XSSFWorkbook
21 | import org.apache.spark.sql.{DataFrame, SaveMode}
22 | import spoiwo.model._
23 | import spoiwo.natures.xlsx.Model2XlsxConversions._
24 |
25 | import java.io.BufferedOutputStream
26 | import scala.jdk.CollectionConverters._
27 |
28 | object ExcelFileSaver {
29 | final val DEFAULT_SHEET_NAME = "Sheet1"
30 | final val DEFAULT_DATE_FORMAT = "yy-m-d h:mm"
31 | final val DEFAULT_TIMESTAMP_FORMAT = "yyyy-mm-dd hh:mm:ss.000"
32 | }
33 |
34 | class ExcelFileSaver(
35 | fs: FileSystem,
36 | location: Path,
37 | dataFrame: DataFrame,
38 | saveMode: SaveMode,
39 | dataLocator: DataLocator,
40 | header: Boolean = true
41 | ) {
42 | def save(): Unit = {
43 | def sheet(workbook: XSSFWorkbook) = {
44 | val headerRow = if (header) Some(dataFrame.schema.fields.map(_.name).toSeq) else None
45 | val dataRows = dataFrame
46 | .toLocalIterator()
47 | .asScala
48 | .map(_.toSeq)
49 | dataLocator.toSheet(headerRow, dataRows, workbook)
50 | }
51 | val fileAlreadyExists = fs.exists(location)
52 | def writeToWorkbook(workbook: XSSFWorkbook): Unit = {
53 | Workbook(sheet(workbook)).writeToExisting(workbook)
54 | autoClose(new BufferedOutputStream(fs.create(location)))(workbook.write)
55 | }
56 | (fileAlreadyExists, saveMode) match {
57 | case (false, _) | (_, SaveMode.Overwrite) =>
58 | if (fileAlreadyExists) {
59 | fs.delete(location, true)
60 | }
61 | writeToWorkbook(new XSSFWorkbook())
62 | case (true, SaveMode.ErrorIfExists) =>
63 | sys.error(s"path $location already exists.")
64 | case (true, SaveMode.Ignore) => ()
65 | case (true, SaveMode.Append) =>
66 | val inputStream: FSDataInputStream = fs.open(location)
67 | val workbook = new XSSFWorkbook(inputStream)
68 | inputStream.close()
69 | writeToWorkbook(workbook)
70 | }
71 | }
72 |
73 | def autoClose[A <: AutoCloseable, B](closeable: A)(fun: (A) => B): B = {
74 | try {
75 | fun(closeable)
76 | } finally {
77 | closeable.close()
78 | }
79 | }
80 | }
81 |
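// A minimal write-side sketch (output path hypothetical). SaveMode.Append reopens the
// existing workbook, as handled by the match in save() above:
//
//   df.write
//     .format("dev.mauch.spark.excel")
//     .option("header", "true")
//     .mode(org.apache.spark.sql.SaveMode.Append)
//     .save("/path/to/output.xlsx")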
--------------------------------------------------------------------------------
/src/main/scala/dev/mauch/spark/excel/InferSchema.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel
18 |
19 | import org.apache.spark.rdd.RDD
20 | import org.apache.spark.sql.types._
21 |
22 | private[excel] object InferSchema {
23 | type CellType = Int
24 |
25 | /** Similar to the JSON schema inference. [[org.apache.spark.sql.execution.datasources.json.InferSchema]]
26 |   * 1. Infer type of each row; 2. Merge row types to find a common type; 3. Replace any null types with string type.
27 | */
28 | def apply(rowsRDD: RDD[Seq[DataType]]): Array[DataType] = {
29 | val startType: Array[DataType] = Array.empty
30 | val rootTypes: Array[DataType] = rowsRDD.aggregate(startType)(inferRowType, mergeRowTypes)
31 |
32 | rootTypes.map {
33 | case _: NullType => StringType
34 | case other => other
35 | }
36 | }
37 |
38 | private def inferRowType(rowSoFar: Array[DataType], next: Seq[DataType]): Array[DataType] = {
39 | val maxLength = math.max(rowSoFar.length, next.size)
40 | val defaultDataType: Int => DataType = (_ => NullType)
41 | val filledRowSoFar = Array.tabulate(maxLength)(n => rowSoFar.applyOrElse[Int, DataType](n, defaultDataType))
42 | val filledNext = Array.tabulate(maxLength)(n => next.applyOrElse[Int, DataType](n, defaultDataType))
43 | filledRowSoFar.zip(filledNext).map { case (r, n) => inferField(r, n) }
44 | }
45 |
46 | private[excel] def mergeRowTypes(first: Array[DataType], second: Array[DataType]): Array[DataType] = {
47 | first.zipAll(second, NullType, NullType).map { case ((a, b)) =>
48 | findTightestCommonType(a, b).getOrElse(NullType)
49 | }
50 | }
51 |
52 | /** Infer type of string field. Given known type Double, and a string "1", there is no point checking if it is an Int,
53 | * as the final type must be Double or higher.
54 | */
55 | private[excel] def inferField(typeSoFar: DataType, field: DataType): DataType = {
56 | // Defining a function to return the StringType constant is necessary in order to work around
57 | // a Scala compiler issue which leads to runtime incompatibilities with certain Spark versions;
58 | // see issue #128 for more details.
59 | def stringType(): DataType = {
60 | StringType
61 | }
62 |
63 | if (field == NullType) {
64 | typeSoFar
65 | } else {
66 | (typeSoFar, field) match {
67 | case (NullType, ct) => ct
68 | case (DoubleType, DoubleType) => DoubleType
69 | case (BooleanType, BooleanType) => BooleanType
70 | case (TimestampType, TimestampType) => TimestampType
71 | case (StringType, _) => stringType()
72 | case (_, _) => stringType()
73 | }
74 | }
75 | }
76 |
77 | /** Copied from internal Spark api [[org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion]]
78 | */
79 | private val numericPrecedence: IndexedSeq[DataType] =
80 | IndexedSeq[DataType](ByteType, ShortType, IntegerType, LongType, FloatType, DoubleType, TimestampType)
81 |
82 | /** Copied from internal Spark api [[org.apache.spark.sql.catalyst.analysis.HiveTypeCoercion]]
83 | */
84 | val findTightestCommonType: (DataType, DataType) => Option[DataType] = {
85 | case (t1, t2) if t1 == t2 => Some(t1)
86 | case (NullType, t1) => Some(t1)
87 | case (t1, NullType) => Some(t1)
88 | case (StringType, _) => Some(StringType)
89 | case (_, StringType) => Some(StringType)
90 |
91 |     // Promote numeric types to the highest of the two
92 | case (t1, t2) if Seq(t1, t2).forall(numericPrecedence.contains) =>
93 | val index = numericPrecedence.lastIndexWhere(t => t == t1 || t == t2)
94 | Some(numericPrecedence(index))
95 |
96 | case _ => None
97 | }
98 | }
99 |
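// Illustrative results of the merging rules above (callable only from code inside the
// dev.mauch.spark.excel package, since the object is package-private):
//
//   import org.apache.spark.sql.types._
//   InferSchema.findTightestCommonType(IntegerType, DoubleType)   // Some(DoubleType), via numericPrecedence
//   InferSchema.findTightestCommonType(DoubleType, StringType)    // Some(StringType)
//   InferSchema.mergeRowTypes(Array(DoubleType), Array(NullType)) // Array(DoubleType)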
--------------------------------------------------------------------------------
/src/main/scala/dev/mauch/spark/excel/PlainNumberFormat.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel
18 |
19 | import java.math.BigDecimal
20 | import java.text.FieldPosition
21 | import java.text.Format
22 | import java.text.ParsePosition
23 |
24 | /** A format that formats a double as a plain string without rounding or scientific notation. All other operations are
25 | * unsupported.
26 | * @see
27 | * [[org.apache.poi.ss.usermodel.ExcelGeneralNumberFormat]] and SSNFormat from
28 | * [[org.apache.poi.ss.usermodel.DataFormatter]] from Apache POI.
29 | */
30 | object PlainNumberFormat extends Format {
31 |
32 | override def format(number: AnyRef, toAppendTo: StringBuffer, pos: FieldPosition): StringBuffer = {
33 | // Convert to BigDecimal for formatting
34 | val bd = new BigDecimal(number.toString)
35 | // Check if the number is an integer (scale == 0 after stripping trailing zeros)
36 | val stripped = bd.stripTrailingZeros()
37 | if (stripped.scale() <= 0) {
38 | // It's an integer, format without decimal point
39 | toAppendTo.append(stripped.toBigInteger().toString())
40 | } else {
41 | // It's not an integer, format as plain string
42 | toAppendTo.append(bd.toPlainString)
43 | }
44 | }
45 |
46 | override def parseObject(source: String, pos: ParsePosition): AnyRef =
47 | throw new UnsupportedOperationException()
48 | }
49 |
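// Illustrative behaviour of the formatter above (example values only), using the
// single-argument format(Object) inherited from java.text.Format:
//
//   PlainNumberFormat.format(Double.box(1.23456789e10)) // "12345678900" (no scientific notation)
//   PlainNumberFormat.format(Double.box(0.0000123))     // "0.0000123"   (instead of "1.23E-5")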
--------------------------------------------------------------------------------
/src/main/scala/dev/mauch/spark/excel/Utils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel
18 | import scala.util.{Success, Try}
19 |
20 | object Utils {
21 | implicit class RichTry[T](t: Try[T]) {
22 | def toEither: Either[Throwable, T] = t.transform(s => Success(Right(s)), f => Success(Left(f))).get
23 | }
24 |
25 | case class MapIncluding[K](keys: Seq[K], optionally: Seq[K] = Seq()) {
26 | def unapply[V](m: Map[K, V]): Option[(Seq[V], Seq[Option[V]])] =
27 | if (keys.forall(m.contains)) {
28 | Some((keys.map(m), optionally.map(m.get)))
29 | } else {
30 | None
31 | }
32 | }
33 | sealed trait MapRequirements[K] {
34 | type ResultType[V]
35 | def unapplySeq[V](m: Map[K, V]): Option[ResultType[V]]
36 | }
37 | case class RequiredKeys[K](keys: K*) extends MapRequirements[K] {
38 | type ResultType[V] = Seq[V]
39 | def unapplySeq[V](m: Map[K, V]): Option[Seq[V]] =
40 | if (keys.forall(m.contains)) {
41 | Some(keys.map(m))
42 | } else {
43 | None
44 | }
45 | }
46 | case class OptionalKeys[K](keys: K*) extends MapRequirements[K] {
47 | type ResultType[V] = Seq[Option[V]]
48 | def unapplySeq[V](m: Map[K, V]): Option[Seq[Option[V]]] = Some(keys.map(m.get))
49 | }
50 | case class MapWith[K](
51 | requiredKeys: RequiredKeys[K] = RequiredKeys[K](),
52 | optionalKeys: OptionalKeys[K] = OptionalKeys[K]()
53 | ) {
54 | def unapply[V](m: Map[K, V]): Option[(requiredKeys.ResultType[V], optionalKeys.ResultType[V])] =
55 | for {
56 | req <- requiredKeys.unapplySeq(m)
57 | opt <- optionalKeys.unapplySeq(m)
58 | } yield (req, opt)
59 | }
60 | }
61 |
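// Illustrative use of the extractors above (hypothetical keys and values); this mirrors
// how WorkbookReader pattern-matches its options map:
//
//   val WithPath = MapIncluding(Seq("path"), optionally = Seq("workbookPassword"))
//   Map("path" -> "/tmp/a.xlsx") match {
//     case WithPath(Seq(path), Seq(passwordOption)) => (path, passwordOption) // ("/tmp/a.xlsx", None)
//   }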
--------------------------------------------------------------------------------
/src/main/scala/dev/mauch/spark/excel/WorkbookReader.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel
18 |
19 | import java.io.InputStream
20 | import dev.mauch.spark.excel.Utils.MapIncluding
21 | import com.github.pjfanning.xlsx.StreamingReader
22 | import org.apache.hadoop.conf.Configuration
23 | import org.apache.hadoop.fs.{FileSystem, Path}
24 | import org.apache.poi.ss.usermodel.{Workbook, WorkbookFactory}
25 | import org.apache.poi.hssf.usermodel.HSSFWorkbookFactory
26 | import org.apache.poi.openxml4j.util.ZipInputStreamZipEntrySource
27 | import org.apache.poi.util.IOUtils
28 | import org.apache.poi.xssf.usermodel.XSSFWorkbookFactory
29 | import scala.collection.JavaConverters.mapAsScalaMapConverter
30 |
31 | trait WorkbookReader {
32 | protected def openWorkbook(): Workbook
33 | def withWorkbook[T](f: Workbook => T): T = {
34 | val workbook = openWorkbook()
35 | val res = f(workbook)
36 | workbook.close()
37 | res
38 | }
39 | def sheetNames: Seq[String] = {
40 | withWorkbook(workbook =>
41 | for (sheetIx <- (0 until workbook.getNumberOfSheets())) yield {
42 | workbook.getSheetAt(sheetIx).getSheetName()
43 | }
44 | )
45 | }
46 | }
47 |
48 | object WorkbookReader {
49 | val WithLocationMaxRowsInMemoryAndPassword =
50 | MapIncluding(
51 | Seq("path"),
52 | optionally = Seq("maxRowsInMemory", "workbookPassword", "maxByteArraySize", "tempFileThreshold")
53 | )
54 |
55 | WorkbookFactory.addProvider(new HSSFWorkbookFactory)
56 | WorkbookFactory.addProvider(new XSSFWorkbookFactory)
57 |
58 | def apply(parameters: java.util.HashMap[String, String], hadoopConfiguration: Configuration): WorkbookReader = {
59 | apply(parameters.asScala.toMap, hadoopConfiguration)
60 | }
61 |
62 | def apply(parameters: Map[String, String], hadoopConfiguration: Configuration): WorkbookReader = {
63 | def readFromHadoop(location: String) = {
64 | val path = new Path(location)
65 | FileSystem.get(path.toUri, hadoopConfiguration).open(path)
66 | }
67 | parameters match {
68 | case WithLocationMaxRowsInMemoryAndPassword(
69 | Seq(location),
70 | Seq(Some(maxRowsInMemory), passwordOption, maxByteArraySizeOption, tempFileThreshold)
71 | ) =>
72 | new StreamingWorkbookReader(
73 | readFromHadoop(location),
74 | passwordOption,
75 | maxRowsInMemory.toInt,
76 | maxByteArraySizeOption.map(_.toInt),
77 | tempFileThreshold.map(_.toInt)
78 | )
79 | case WithLocationMaxRowsInMemoryAndPassword(
80 | Seq(location),
81 | Seq(None, passwordOption, maxByteArraySizeOption, tempFileThresholdOption)
82 | ) =>
83 | new DefaultWorkbookReader(
84 | readFromHadoop(location),
85 | passwordOption,
86 | maxByteArraySizeOption.map(_.toInt),
87 | tempFileThresholdOption.map(_.toInt)
88 | )
89 | }
90 | }
91 | }
92 | class DefaultWorkbookReader(
93 | inputStreamProvider: => InputStream,
94 | workbookPassword: Option[String],
95 | maxByteArraySize: Option[Int],
96 | tempFileThreshold: Option[Int]
97 | ) extends WorkbookReader {
98 |
99 | protected def openWorkbook(): Workbook = {
100 | maxByteArraySize.foreach { maxSize =>
101 | IOUtils.setByteArrayMaxOverride(maxSize)
102 | }
103 | tempFileThreshold.foreach { threshold =>
104 | ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(threshold)
105 | }
106 | workbookPassword
107 | .fold(WorkbookFactory.create(inputStreamProvider))(password =>
108 | WorkbookFactory.create(inputStreamProvider, password)
109 | )
110 | }
111 | }
112 |
113 | class StreamingWorkbookReader(
114 | inputStreamProvider: => InputStream,
115 | workbookPassword: Option[String],
116 | maxRowsInMem: Int,
117 | maxByteArraySize: Option[Int],
118 | tempFileThreshold: Option[Int]
119 | ) extends WorkbookReader {
120 | override protected def openWorkbook(): Workbook = {
121 | maxByteArraySize.foreach { maxSize =>
122 | IOUtils.setByteArrayMaxOverride(maxSize)
123 | }
124 | tempFileThreshold.foreach { threshold =>
125 | ZipInputStreamZipEntrySource.setThresholdBytesForTempFiles(threshold)
126 | }
127 | val builder = StreamingReader
128 | .builder()
129 | .rowCacheSize(maxRowsInMem)
130 | .bufferSize(4096)
131 | workbookPassword
132 | .fold(builder)(password => builder.password(password))
133 | .open(inputStreamProvider)
134 | }
135 | }
136 |
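// Which implementation is chosen depends on whether maxRowsInMemory is set
// (sketch with hypothetical paths; hadoopConf is an existing Hadoop Configuration):
//
//   WorkbookReader(Map("path" -> "/data/in.xlsx", "maxRowsInMemory" -> "100"), hadoopConf) // StreamingWorkbookReader
//   WorkbookReader(Map("path" -> "/data/in.xlsx"), hadoopConf)                             // DefaultWorkbookReader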
--------------------------------------------------------------------------------
/src/main/scala/dev/mauch/spark/excel/package.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark
18 |
19 | import org.apache.poi.ss.usermodel.Row.MissingCellPolicy
20 | import org.apache.poi.ss.usermodel.{Cell, CellType, Row}
21 | import org.apache.spark.sql.{DataFrameReader, DataFrameWriter}
22 | import spoiwo.model.Sheet
23 |
24 | package object excel {
25 | implicit class RichRow(val row: Row) extends AnyVal {
26 | def eachCellIterator(startColumn: Int, endColumn: Int): Iterator[Option[Cell]] =
27 | new Iterator[Option[Cell]] {
28 | private val lastCellInclusive = row.getLastCellNum - 1
29 | private val endCol = Math.min(endColumn, Math.max(startColumn, lastCellInclusive))
30 | require(startColumn >= 0 && startColumn <= endCol)
31 |
32 | private var nextCol = startColumn
33 |
34 | override def hasNext: Boolean = nextCol <= endCol && nextCol <= lastCellInclusive
35 |
36 | override def next(): Option[Cell] = {
37 | val next =
38 | if (nextCol > endCol) throw new NoSuchElementException(s"column index = $nextCol")
39 | else Option(row.getCell(nextCol, MissingCellPolicy.RETURN_NULL_AND_BLANK))
40 | nextCol += 1
41 | next
42 | }
43 | }
44 | }
45 |
46 | implicit class RichCell(val cell: Cell) extends AnyVal {
47 | def value: Any =
48 | cell.getCellType match {
49 | case CellType.BLANK | CellType.ERROR | CellType._NONE => null
50 | case CellType.NUMERIC => cell.getNumericCellValue
51 | case CellType.STRING => cell.getStringCellValue
52 | case CellType.BOOLEAN => cell.getBooleanCellValue
53 | case CellType.FORMULA =>
54 | cell.getCachedFormulaResultType match {
55 | case CellType.BLANK => null
56 | case CellType.NUMERIC => cell.getNumericCellValue
57 | case CellType.STRING => cell.getRichStringCellValue
58 | case CellType.BOOLEAN => cell.getBooleanCellValue
59 | case _ => null
60 | }
61 | }
62 | }
63 |
64 | implicit class RichSpoiwoSheet(val sheet: Sheet) extends AnyVal {
65 | def extractTableData(tableNumber: Int): Seq[Seq[Any]] = {
66 | val table = sheet.tables(tableNumber)
67 | val (startRow, endRow) = table.cellRange.rowRange
68 | val (startColumn, endColumn) = table.cellRange.columnRange
69 | val tableRows = sheet.rows.filter(r => r.index.exists((startRow to endRow).contains))
70 | tableRows.map(_.cells.filter(_.index.exists((startColumn to endColumn).contains)).map(_.value).toSeq)
71 | }
72 | }
73 |
74 | implicit class ExcelDataFrameReader(val dataFrameReader: DataFrameReader) extends AnyVal {
75 | def excel(
76 | header: Boolean = true,
77 | treatEmptyValuesAsNulls: Boolean = false,
78 | setErrorCellsToFallbackValues: Boolean = false,
79 | inferSchema: Boolean = false,
80 | usePlainNumberFormat: Boolean = false,
81 | addColorColumns: Boolean = false,
82 | dataAddress: String = null,
83 | timestampFormat: String = null,
84 | maxRowsInMemory: java.lang.Integer = null,
85 | maxByteArraySize: java.lang.Integer = null,
86 | tempFileThreshold: java.lang.Integer = null,
87 | excerptSize: Int = 10,
88 | workbookPassword: String = null
89 | ): DataFrameReader = {
90 | Map(
91 | "header" -> header,
92 | "treatEmptyValuesAsNulls" -> treatEmptyValuesAsNulls,
93 | "setErrorCellsToFallbackValues" -> setErrorCellsToFallbackValues,
94 | "usePlainNumberFormat" -> usePlainNumberFormat,
95 | "inferSchema" -> inferSchema,
96 | "addColorColumns" -> addColorColumns,
97 | "dataAddress" -> dataAddress,
98 | "timestampFormat" -> timestampFormat,
99 | "maxRowsInMemory" -> maxRowsInMemory,
100 | "maxByteArraySize" -> maxByteArraySize,
101 | "tempFileThreshold" -> tempFileThreshold,
102 | "excerptSize" -> excerptSize,
103 | "workbookPassword" -> workbookPassword
104 | ).foldLeft(dataFrameReader.format("dev.mauch.spark.excel")) { case (dfReader, (key, value)) =>
105 | value match {
106 | case null => dfReader
107 | case v => dfReader.option(key, v.toString)
108 | }
109 | }
110 | }
111 | }
112 |
113 | implicit class ExcelDataFrameWriter[T](val dataFrameWriter: DataFrameWriter[T]) extends AnyVal {
114 | def excel(
115 | header: Boolean = true,
116 | dataAddress: String = null,
117 | preHeader: String = null,
118 | dateFormat: String = null,
119 | timestampFormat: String = null,
120 | workbookPassword: String = null
121 | ): DataFrameWriter[T] = {
122 | Map(
123 | "header" -> header,
124 | "dataAddress" -> dataAddress,
125 | "dateFormat" -> dateFormat,
126 | "timestampFormat" -> timestampFormat,
127 | "workbookPassword" -> workbookPassword,
128 | "preHeader" -> preHeader
129 | ).foldLeft(dataFrameWriter.format("dev.mauch.spark.excel")) { case (dfWriter, (key, value)) =>
130 | value match {
131 | case null => dfWriter
132 | case v => dfWriter.option(key, v.toString)
133 | }
134 | }
135 | }
136 | }
137 | }
138 |
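// With these implicits in scope, the reader and writer extensions can be used directly
// (a sketch; paths are hypothetical):
//
//   import dev.mauch.spark.excel._
//   val df = spark.read.excel(header = true, inferSchema = true).load("/data/in.xlsx")
//   df.write.excel(header = true, dataAddress = "'Sheet1'!A1").save("/data/out.xlsx")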
--------------------------------------------------------------------------------
/src/main/scala/dev/mauch/spark/excel/v2/ExcelHeaderChecker.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.internal.Logging
20 | import org.apache.spark.sql.internal.SQLConf
21 | import org.apache.spark.sql.types.StructType
22 |
23 | /** Checks that column names in an Excel header and field names in the schema are the same, taking into account case
24 | * sensitivity.
25 | *
26 | * @param schema
27 | * provided (or inferred) schema to which Excel must conform.
28 | * @param options
29 | * parsed Excel options.
30 | * @param source
31 |   *   name of the Excel source that is currently checked. It is used in error messages.
32 | */
33 | class ExcelHeaderChecker(schema: StructType, options: ExcelOptions, source: String) extends Logging {
34 |
35 |   /** If set to `false`, the comparison of column names and schema field names is not case sensitive.
36 | */
37 | private val caseSensitive = SQLConf.get.caseSensitiveAnalysis
38 |
39 |   /** If `true`, column names are ignored; otherwise the Excel column names are checked for conformance to the
40 |     * schema. If a column name does not conform to the schema, an exception is thrown.
41 | */
42 | private val enforceSchema = options.enforceSchema
43 |
44 |   /** Checks that column names in an Excel header and field names in the schema are the same, taking into account case
45 | * sensitivity.
46 | *
47 | * @param columnNames
48 |   *   names of Excel columns that must be checked against the schema.
49 | */
50 | def checkHeaderColumnNames(columnNames: Vector[String]): Unit = {
51 | if (columnNames != null) {
52 | val fieldNames = schema.map(_.name).toIndexedSeq
53 | val (headerLen, schemaSize) = (columnNames.size, fieldNames.length)
54 | var errorMessage: Option[String] = None
55 |
56 | if (headerLen == schemaSize) {
57 | var i = 0
58 | while (errorMessage.isEmpty && i < headerLen) {
59 | var (nameInSchema, nameInHeader) = (fieldNames(i), columnNames(i))
60 | if (!caseSensitive) {
61 | // scalastyle:off caselocale
62 | nameInSchema = nameInSchema.toLowerCase
63 | nameInHeader = nameInHeader.toLowerCase
64 | // scalastyle:on caselocale
65 | }
66 | if (nameInHeader != nameInSchema) {
67 | errorMessage = Some(s"""|Excel header does not conform to the schema.
68 | | Header: ${columnNames.mkString(", ")}
69 | | Schema: ${fieldNames.mkString(", ")}
70 | |Expected: ${fieldNames(i)} but found: ${columnNames(i)}
71 | |$source""".stripMargin)
72 | }
73 | i += 1
74 | }
75 | } else {
76 |         errorMessage = Some(s"""|Number of columns in the Excel header is not equal to the number of fields in the schema:
77 | | Header length: $headerLen, schema size: $schemaSize
78 | |$source""".stripMargin)
79 | }
80 |
81 | errorMessage.foreach { msg =>
82 | if (enforceSchema) { logWarning(msg) }
83 | else { throw new IllegalArgumentException(msg) }
84 | }
85 | }
86 | }
87 |
88 | }
89 |
--------------------------------------------------------------------------------
/src/main/scala/dev/mauch/spark/excel/v2/ExcelOptionsTrait.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.sql.catalyst.util.{
20 | CaseInsensitiveMap,
21 | DateFormatter,
22 | DateTimeUtils,
23 | ParseMode,
24 | PermissiveMode,
25 | TimestampFormatter
26 | }
27 |
28 | import java.time.ZoneId
29 | import java.util.Locale
30 | import scala.annotation.nowarn
31 |
32 | trait ExcelOptionsTrait extends Serializable {
33 |
34 | val parameters: CaseInsensitiveMap[String]
35 | val defaultTimeZoneId: String
36 | val defaultColumnNameOfCorruptRecord: String
37 |
38 | private def getInt(paramName: String): Option[Int] = {
39 | val paramValue = parameters.get(paramName)
40 | paramValue match {
41 | case None => None
42 | case Some(null) => None
43 | case Some(value) =>
44 | try {
45 | Some(value.toInt)
46 | } catch {
47 | case _: NumberFormatException =>
48 | throw new RuntimeException(s"$paramName should be an integer. Found $value")
49 | }
50 | }
51 | }
52 |
53 | private def getBool(paramName: String, default: Boolean): Boolean = {
54 | val param = parameters.getOrElse(paramName, default.toString)
55 | if (param == null) {
56 | default
57 | } else if (param.toLowerCase(Locale.ROOT) == "true") {
58 | true
59 | } else if (param.toLowerCase(Locale.ROOT) == "false") {
60 | false
61 | } else {
62 | throw new Exception(s"$paramName flag can be true or false")
63 | }
64 | }
65 |
66 | /* Parsing mode, how to handle corrupted record. Default to permissive */
67 | val parseMode: ParseMode = parameters
68 | .get("mode")
69 | .map(ParseMode.fromString)
70 | .getOrElse(PermissiveMode)
71 |
72 | val zoneId: ZoneId = ZoneId
73 | .of(parameters.getOrElse(DateTimeUtils.TIMEZONE_OPTION, defaultTimeZoneId))
74 |
75 | /* A language tag in IETF BCP 47 format */
76 | val locale: Locale = parameters.get("locale").map(Locale.forLanguageTag).getOrElse(Locale.US)
77 |
78 | val dateFormat: String = parameters.getOrElse("dateFormat", DateFormatter.defaultPattern)
79 |
80 | @nowarn
81 | val timestampFormat: String = parameters.getOrElse("timestampFormat", TimestampFormatter.defaultPattern)
82 |
83 | /* Have header line when reading and writing */
84 | val header = getBool("header", default = true)
85 |
86 | /* Number of rows to ignore after header. Only in reading */
87 | val ignoreAfterHeader = getInt("ignoreAfterHeader").getOrElse(0)
88 |
89 | val inferSchema = getBool("inferSchema", default = false)
90 | val excerptSize = getInt("excerptSize")
91 |
92 |   /** Forcibly apply the specified or inferred schema to data files. If the option is enabled, headers of Excel files will
93 | * be ignored.
94 | */
95 | val enforceSchema = getBool("enforceSchema", default = true)
96 |
97 | /* Name for column of corrupted records */
98 | val columnNameOfCorruptRecord = parameters
99 | .getOrElse("columnNameOfCorruptRecord", defaultColumnNameOfCorruptRecord)
100 |
101 | val nullValue = parameters.getOrElse("nullValue", "")
102 | val nanValue = parameters.getOrElse("nanValue", "NaN")
103 | val positiveInf = parameters.getOrElse("positiveInf", "Inf")
104 | val negativeInf = parameters.getOrElse("negativeInf", "-Inf")
105 |
106 |   /* If true, format the cells without rounding or scientific notation */
107 | val usePlainNumberFormat = getBool("usePlainNumberFormat", default = false)
108 |
109 | /* If true, keep undefined (Excel) rows */
110 | val keepUndefinedRows = getBool("keepUndefinedRows", default = false)
111 |
112 | /* Use null value for error cells */
113 | val useNullForErrorCells = getBool("useNullForErrorCells", default = false)
114 |
115 | /* Additional column for color */
116 | val addColorColumns = getBool("addColorColumns", default = false)
117 | val ignoreLeadingWhiteSpace = getBool("ignoreLeadingWhiteSpace", default = false)
118 | val ignoreTrailingWhiteSpace = getBool("ignoreTrailingWhiteSpace", default = false)
119 |
120 | /* Additional column for excel row number */
121 | val columnNameOfRowNumber = parameters.get("columnNameOfRowNumber")
122 |
123 | /* Data address, default to everything */
124 | val dataAddress = parameters.getOrElse("dataAddress", "A1")
125 |
126 | /* Workbook password, optional */
127 | val workbookPassword = parameters.get("workbookPassword")
128 |
129 | /* Output excel file extension, default to xlsx */
130 | val fileExtension = parameters.get("fileExtension") match {
131 | case Some(value) => value.trim
132 | case None => "xlsx"
133 | }
134 |
135 |   /* Defines the fraction of the file used for schema inference. For default and
136 | invalid values, 1.0 will be used */
137 | val samplingRatio = {
138 | val r = parameters.get("samplingRatio").map(_.toDouble).getOrElse(1.0)
139 | if (r > 1.0 || r <= 0.0) 1.0 else r
140 | }
141 |
142 | /** Optional parameter for using a streaming reader which can help with big files (will fail if used with xls format
143 | * files)
144 | */
145 | val maxRowsInMemory = getInt("maxRowsInMemory")
146 |
147 | // scalastyle:off
148 | /** Optional parameter for maxByteArraySize
150 | */
151 | val maxByteArraySize = getInt("maxByteArraySize")
152 |
153 | // scalastyle:on
154 | /** Optional parameter for specifying the number of bytes at which a zip entry is regarded as too large for holding in
155 | * memory and the data is put in a temp file instead - useful for sheets with a lot of data
156 | */
157 | val tempFileThreshold = getInt("tempFileThreshold")
158 |
159 | // scalastyle:on
160 | /** Optional parameter to specify whether the sheet name in dataAddress is a regex (for loading multiple sheets at
161 | * once) or the true sheet name
162 | */
163 | val sheetNameIsRegex = getBool("sheetNameIsRegex", false)
164 | }
165 |
--------------------------------------------------------------------------------
/src/main/scala/dev/mauch/spark/excel/v2/SheetData.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import java.io.Closeable
20 |
21 | case class SheetData[T](rowIterator: Iterator[T], resourcesToClose: Seq[Closeable] = Seq.empty) extends Closeable {
22 | def modifyIterator(f: Iterator[T] => Iterator[T]): SheetData[T] = SheetData(f(rowIterator), resourcesToClose)
23 | def append(other: SheetData[T]): SheetData[T] =
24 | SheetData(rowIterator ++ other.rowIterator, resourcesToClose ++ other.resourcesToClose)
25 | override def close(): Unit = resourcesToClose.foreach(_.close())
26 | }
27 |
--------------------------------------------------------------------------------
/src/test/resources/log4j2.properties:
--------------------------------------------------------------------------------
1 | # config for log4j 1.x (spark < 3.3)
2 | log4j.rootCategory=ERROR, console
3 | log4j.appender.console=org.apache.log4j.ConsoleAppender
4 | log4j.appender.console.target=System.out
5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
7 |
8 | # config for log4j 2.x (spark >= 3.3)
9 | # Extra logging related to initialization of Log4j
10 | # Set to debug or trace if log4j initialization is failing
11 | status = warn
12 |
13 |
14 | # Console appender configuration
15 | appender.console.type = Console
16 | appender.console.name = consoleLogger
17 | appender.console.layout.type = PatternLayout
18 | appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
19 |
20 | # Root logger level
21 | rootLogger.level = warn
22 | # Root logger referring to console appender
23 | rootLogger.appenderRef.stdout.ref = consoleLogger
24 |
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/Issue_747_plain_number.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/Issue_747_plain_number.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/apache_poi/57231_MixedGasReport.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/apache_poi/57231_MixedGasReport.xls
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/apache_poi/DataTableCities.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/apache_poi/DataTableCities.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=1/ca_03.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=1/ca_03.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_04.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_04.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_05.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_05.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_06.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=2/ca_06.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_07.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_07.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_08.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_08.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_09.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=3/ca_09.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_10.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_10.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_11.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_11.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_12.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/ca_dataset/2019/Quarter=4/ca_12.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/infer_stricter_numerical_types.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/infer_stricter_numerical_types.xls
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/infer_stricter_numerical_types.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/infer_stricter_numerical_types.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/issue_162_nihar_gharat.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_162_nihar_gharat.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/issue_285_bryce21.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_285_bryce21.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/issue_463_cristichircu.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_463_cristichircu.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/issue_942_sheetname_digits.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_942_sheetname_digits.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/issue_944_faulty_dimension.md:
--------------------------------------------------------------------------------
1 | The issue_944_faulty_dimension.xlsx file contains `<dimension>` tags on each sheet that do not conform to the true / physical size of the sheets (e.g. for sheet1 the declared range differs from the sheet's actual contents).
2 |
3 | It was fabricated by hand and is used to test the library's ability to handle such cases.
4 |
5 | This is how the file was created:
6 | * take a valid excel file
7 | * rename extension from xlsx to zip
8 | * unzip it
9 | * patch the `<dimension>` tags in `xl/worksheets/sheet1.xml` and `xl/worksheets/sheet2.xml`
10 | * zip it back
11 | * rename extension back to xlsx
12 |
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/issue_944_faulty_dimension.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_944_faulty_dimension.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/issue_965_blank_rows.md:
--------------------------------------------------------------------------------
1 | The issue_965_blank_rows.xlsx file is used to test that rows containing no values are discarded when read with `keepUndefinedRows == false`.
2 |
3 | The Excel file was fabricated by hand and is used to test the library's ability to handle such cases.
4 |
5 | This is how the file was created:
6 | * take a valid excel file
7 | * rename extension from xlsx to zip
8 | * unzip it
9 | * add empty row definitions to `xl/worksheets/sheet1.xml` (see below)
10 | * zip it back
11 | * rename extension back to xlsx
12 |
13 |
14 | The empty row definitions added to the file are as follows:
15 | ```xml
16 |
17 |
18 |
19 |
20 | ```
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/issue_965_blank_rows.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/issue_965_blank_rows.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/plain_number.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/plain_number.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/read_multiple_sheets_at_once.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/read_multiple_sheets_at_once.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/read_multiple_sheets_at_once_noheader.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/read_multiple_sheets_at_once_noheader.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/simple_encrypted.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/simple_encrypted.xls
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/simple_encrypted.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/simple_encrypted.xlsx
--------------------------------------------------------------------------------
/src/test/resources/spreadsheets/with_errors_all_types.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/nightscape/spark-excel/a7a275798468f5ab51f6ab056dce2c2dc8ffbffa/src/test/resources/spreadsheets/with_errors_all_types.xlsx
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/DataFrameSuiteBase.scala:
--------------------------------------------------------------------------------
1 | package dev.mauch.spark
2 |
3 | import com.github.mrpowers.spark.fast.tests.DataFrameComparer
4 | import org.apache.spark.sql.{DataFrame, Row, SparkSession}
5 |
6 | import java.sql.Timestamp
7 |
8 | trait DataFrameSuiteBase extends DataFrameComparer {
9 |
10 | lazy val spark: SparkSession = SparkSession
11 | .builder()
12 | .master("local")
13 | .appName("spark-excel session")
14 | .config("spark.sql.shuffle.partitions", "1")
15 | .getOrCreate()
16 |
17 | def assertDataFrameEquals(df1: DataFrame, df2: DataFrame): Unit =
18 | assertSmallDataFrameEquality(df1, df2)
19 |
20 | def assertDataFrameApproximateEquals(expectedDF: DataFrame, actualDF: DataFrame, relTol: Double): Unit = {
21 | val e = (r1: Row, r2: Row) => {
22 | r1.equals(r2) || RelTolComparer.areRowsEqual(r1, r2, relTol)
23 | }
24 | assertLargeDatasetEquality[Row](
25 | actualDF,
26 | expectedDF,
27 | equals = e,
28 | ignoreNullable = false,
29 | ignoreColumnNames = false,
30 | orderedComparison = false
31 | )
32 | }
33 |
34 | def assertDataFrameNoOrderEquals(df1: DataFrame, df2: DataFrame): Unit =
35 | assertSmallDataFrameEquality(df1, df2, orderedComparison = false)
36 | }
37 |
38 | object RelTolComparer {
39 |
40 | trait ToNumeric[T] {
41 | def toNumeric(x: Double): T
42 | }
43 | object ToNumeric {
44 | implicit val doubleToDouble: ToNumeric[Double] = new ToNumeric[Double] {
45 | def toNumeric(x: Double): Double = x
46 | }
47 | implicit val doubleToFloat: ToNumeric[Float] = new ToNumeric[Float] {
48 | def toNumeric(x: Double): Float = x.toFloat
49 | }
50 | implicit val doubleToLong: ToNumeric[Long] = new ToNumeric[Long] {
51 | def toNumeric(x: Double): Long = x.toLong
52 | }
53 | implicit val doubleToBigDecimal: ToNumeric[BigDecimal] = new ToNumeric[BigDecimal] {
54 | def toNumeric(x: Double): BigDecimal = BigDecimal(x)
55 | }
56 | }
57 |
58 | /** Approximate equality, based on equals from [[Row]] */
59 | def areRowsEqual(r1: Row, r2: Row, relTol: Double): Boolean = {
60 | def withinRelTol[T : Numeric : ToNumeric](a: T, b: T): Boolean = {
61 | val num = implicitly[Numeric[T]]
62 | val toNum = implicitly[ToNumeric[T]]
63 | val absTol = num.times(toNum.toNumeric(relTol), num.max(num.abs(a), num.abs(b)))
64 | val diff = num.abs(num.minus(a, b))
65 | num.lteq(diff, absTol)
66 | }
67 |
68 | if (r1.length != r2.length) {
69 | return false
70 | } else {
71 | (0 until r1.length).foreach(idx => {
72 | if (r1.isNullAt(idx) != r2.isNullAt(idx)) {
73 | return false
74 | }
75 |
76 | if (!r1.isNullAt(idx)) {
77 | val o1 = r1.get(idx)
78 | val o2 = r2.get(idx)
79 | o1 match {
80 | case b1: Array[Byte] =>
81 | if (!java.util.Arrays.equals(b1, o2.asInstanceOf[Array[Byte]])) {
82 | return false
83 | }
84 |
85 | case f1: Float =>
86 | if (
87 | java.lang.Float.isNaN(f1) !=
88 | java.lang.Float.isNaN(o2.asInstanceOf[Float])
89 | ) {
90 | return false
91 | }
92 | if (!withinRelTol[Float](f1, o2.asInstanceOf[Float])) {
93 | return false
94 | }
95 |
96 | case d1: Double =>
97 | if (
98 | java.lang.Double.isNaN(d1) !=
99 | java.lang.Double.isNaN(o2.asInstanceOf[Double])
100 | ) {
101 | return false
102 | }
103 | if (!withinRelTol[Double](d1, o2.asInstanceOf[Double])) {
104 | return false
105 | }
106 |
107 | case d1: java.math.BigDecimal =>
108 | if (!withinRelTol(BigDecimal(d1), BigDecimal(o2.asInstanceOf[java.math.BigDecimal]))) {
109 | return false
110 | }
111 |
112 | case t1: Timestamp =>
113 | if (!withinRelTol(t1.getTime, o2.asInstanceOf[Timestamp].getTime)) {
114 | return false
115 | }
116 |
117 | case _ =>
118 | if (o1 != o2) return false
119 | }
120 | }
121 | })
122 | }
123 | true
124 | }
125 |
126 | }
127 |
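// Illustrative behaviour of the relative-tolerance comparison above (example values only):
//
//   import org.apache.spark.sql.Row
//   RelTolComparer.areRowsEqual(Row(100.0), Row(101.0), relTol = 0.05) // true:  |100.0 - 101.0| <= 0.05 * 101.0
//   RelTolComparer.areRowsEqual(Row(100.0), Row(120.0), relTol = 0.05) // false: |100.0 - 120.0| >  0.05 * 120.0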
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/DataLocatorSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel
18 |
19 | import org.apache.poi.xssf.usermodel.XSSFWorkbook
20 | import org.scalacheck.Gen
21 | import org.scalatest.funspec.AnyFunSpec
22 | import org.scalatest.matchers.should.Matchers
23 | import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks
24 | import spoiwo.model.Workbook
25 | import spoiwo.natures.xlsx.Model2XlsxConversions._
26 |
27 | import scala.jdk.CollectionConverters._
28 | import scala.collection.compat._
29 |
30 | class DataLocatorSuite extends AnyFunSpec with ScalaCheckPropertyChecks with Matchers with Generators {
31 | describe("with a table reference") {
32 | val dl = DataLocator(Map("dataAddress" -> s"$tableName[#All]"))
33 | describe("containing #All") {
34 | it("extracts the entire table data") {
35 | forAll(sheetWithTableGen) { sheet =>
36 | val actualData = dl.readFrom(sheet.convertAsXlsx()).map(_.map(_.value)).to(Seq)
37 | actualData should contain theSameElementsAs sheet.extractTableData(0)
38 | }
39 | }
40 |
41 | it("writes into a new table in a new sheet if no corresponding table exists") {
42 | forAll(sheetGenerator(withHeader = Gen.const(true), numCols = Gen.choose(1, 200))) { dataSheet =>
43 | val workbook = new XSSFWorkbook()
44 | val header = dataSheet.rows.head.cells.map(_.value.toString).toSeq
45 | val generatedSheet = dl.toSheet(
46 | header = Some(header),
47 | data = dataSheet.rows.tail.iterator.map(_.cells.map(_.value.toString).toSeq),
48 | existingWorkbook = workbook
49 | )
50 | generatedSheet.convertAsXlsx(workbook)
51 | val pTable = workbook.getTable(tableName)
52 | pTable.getSheetName should equal(tableName)
53 | pTable.getColumns.asScala.map(_.getName) should contain theSameElementsInOrderAs header
54 | val actualData = dl.readFrom(workbook).map(_.map(_.value)).to(Seq)
55 | actualData should contain theSameElementsAs dataSheet.rows.map(_.cells.map(_.value))
56 | }
57 | }
58 |
59 | it("overwrites an existing table") {
60 | forAll(sheetWithTableGen) { sheetWithTable =>
61 | val workbook = sheetWithTable.convertAsXlsx()
62 | val table = sheetWithTable.tables.head
63 | val header = table.columns.map(_.name)
64 | val tableData = dl.readFrom(workbook).map(_.map(c => s"new_$c")).toList
65 | val generatedSheet =
66 | dl.toSheet(header = tableData.headOption, data = tableData.iterator.drop(1), existingWorkbook = workbook)
67 | Workbook(generatedSheet).writeToExisting(workbook)
68 | val pTable = workbook.getTable(tableName)
69 | pTable.getSheetName should equal(sheetName)
70 | pTable.getColumns.asScala.map(_.getName) should contain theSameElementsInOrderAs header
71 | val actualData = dl.readFrom(workbook).map(_.map(_.value)).to(Seq)
72 | actualData should contain theSameElementsAs tableData
73 | }
74 | }
75 | }
76 | }
77 | describe("without any dataAddress") {
78 | it("defaults to starting at cell A1 in the first sheet") {
79 | val dl = DataLocator(Map())
80 | dl shouldBe a[CellRangeAddressDataLocator]
81 | val cradl = dl.asInstanceOf[CellRangeAddressDataLocator]
82 | cradl.dataAddress.getFirstCell.formatAsString() should equal("A1")
83 | cradl.dataAddress.getFirstCell.getSheetName should equal(null)
84 | }
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/EncryptedReadSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel
18 |
19 | import org.apache.spark.sql._
20 | import org.apache.spark.sql.types._
21 |
22 | import dev.mauch.spark.DataFrameSuiteBase
23 | import org.scalatest.funspec.AnyFunSpec
24 | import org.scalatest.matchers.should.Matchers
25 | import scala.jdk.CollectionConverters._
26 |
27 | object EncryptedReadSuite {
28 | val simpleSchema = StructType(
29 | List(
30 | StructField("A", DoubleType, true),
31 | StructField("B", DoubleType, true),
32 | StructField("C", DoubleType, true),
33 | StructField("D", DoubleType, true)
34 | )
35 | )
36 |
37 | val expectedData = List(Row(1.0d, 2.0d, 3.0d, 4.0d)).asJava
38 | }
39 |
40 | class EncryptedReadSuite extends AnyFunSpec with DataFrameSuiteBase with Matchers {
41 | import EncryptedReadSuite._
42 |
43 | lazy val expected = spark.createDataFrame(expectedData, simpleSchema)
44 |
45 | def readFromResources(path: String, password: String, maxRowsInMemory: Option[Int] = None): DataFrame = {
46 | val url = getClass.getResource(path)
47 | val reader = spark.read
48 | .excel(
49 | dataAddress = s"Sheet1!A1",
50 | treatEmptyValuesAsNulls = true,
51 | workbookPassword = password,
52 | inferSchema = true
53 | )
54 | val withMaxRows = maxRowsInMemory.fold(reader)(rows => reader.option("maxRowsInMemory", s"$rows"))
55 | withMaxRows.load(url.getPath)
56 | }
57 |
58 | describe("spark-excel") {
59 | it("should read encrypted xslx file") {
60 | val df = readFromResources("/spreadsheets/simple_encrypted.xlsx", "fooba")
61 |
62 | assertDataFrameEquals(expected, df)
63 | }
64 |
65 | it("should read encrypted xlsx file with maxRowsInMem=10") {
66 | val df = readFromResources("/spreadsheets/simple_encrypted.xlsx", "fooba", maxRowsInMemory = Some(10))
67 |
68 | assertDataFrameEquals(expected, df)
69 | }
70 |
71 | it("should read encrypted xlsx file with maxRowsInMem=1") {
72 | val df = readFromResources("/spreadsheets/simple_encrypted.xlsx", "fooba", maxRowsInMemory = Some(1))
73 |
74 | assertDataFrameEquals(expected, df)
75 | }
76 |
77 | it("should read encrypted xls file") {
78 | val df = readFromResources("/spreadsheets/simple_encrypted.xls", "fooba")
79 |
80 | assertDataFrameEquals(expected, df)
81 | }
82 | }
83 | }
84 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/ErrorsAsStringsReadSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel
18 |
19 | import dev.mauch.spark.DataFrameSuiteBase
20 | import org.apache.spark.sql.types._
21 | import org.apache.spark.sql.{Row, _}
22 | import org.scalatest.funspec.AnyFunSpec
23 | import org.scalatest.matchers.should.Matchers
24 |
25 | import java.sql.Timestamp
26 | import java.time.LocalDateTime
27 | import java.util
28 | import scala.jdk.CollectionConverters._
29 |
30 | object ErrorsAsStringsReadSuite {
31 | private val dummyTimestamp = Timestamp.valueOf(LocalDateTime.of(2021, 2, 19, 0, 0))
32 | private val epochTimestamp = new Timestamp(0)
33 | private val dummyText = "hello"
34 |
35 | private val expectedSchemaInfer = StructType(
36 | List(
37 | StructField("double", DoubleType, true),
38 | StructField("boolean", BooleanType, true),
39 | StructField("timestamp", TimestampType, true),
40 | StructField("string", StringType, true),
41 | StructField("formula", StringType, true)
42 | )
43 | )
44 | private val expectedDataErrorsAsStringsInfer: util.List[Row] =
45 | List(
46 | Row(1.0, true, dummyTimestamp, dummyText, "A1"),
47 | Row(2.0, false, dummyTimestamp, dummyText, "A3"),
48 | Row(0.0, false, epochTimestamp, "", ""),
49 | Row(0.0, false, epochTimestamp, "", "")
50 | ).asJava
51 |
52 | private val expectedDataErrorsAsNullInfer: util.List[Row] =
53 | List(
54 | Row(1.0, true, dummyTimestamp, dummyText, "A1"),
55 | Row(2.0, false, dummyTimestamp, dummyText, "A3"),
56 | Row(null, null, null, null, null),
57 | Row(null, null, null, null, null)
58 | ).asJava
59 |
60 | private val expectedSchemaNonInfer = StructType(
61 | List(
62 | StructField("double", StringType, true),
63 | StructField("boolean", StringType, true),
64 | StructField("timestamp", StringType, true),
65 | StructField("string", StringType, true),
66 | StructField("formula", StringType, true)
67 | )
68 | )
69 | private val expectedDataErrorsAsStringsNonInfer: util.List[Row] =
70 | List(
71 | Row("1", "TRUE", "19\"-\"Feb\"-\"2021", dummyText, "A1"),
72 | Row("2", "FALSE", "19\"-\"Feb\"-\"2021", dummyText, "A3"),
73 | Row("", "", "", "", ""),
74 | Row("", "", "", "", "")
75 | ).asJava
76 |
77 | private val expectedDataErrorsAsNullNonInfer: util.List[Row] =
78 | List(
79 | Row("1", "TRUE", "19\"-\"Feb\"-\"2021", "hello", "A1"),
80 | Row("2", "FALSE", "19\"-\"Feb\"-\"2021", "hello", "A3"),
81 | Row(null, null, null, null, null),
82 | Row(null, null, null, null, null)
83 | ).asJava
84 |
85 | private val excelLocation = "/spreadsheets/with_errors_all_types.xlsx"
86 | }
87 |
88 | class ErrorsAsStringsReadSuite extends AnyFunSpec with DataFrameSuiteBase with Matchers {
89 | import ErrorsAsStringsReadSuite._
90 |
91 | def readFromResources(path: String, setErrorCellsToFallbackValues: Boolean, inferSchema: Boolean): DataFrame = {
92 | val url = getClass.getResource(path)
93 | spark.read
94 | .excel(setErrorCellsToFallbackValues = setErrorCellsToFallbackValues, inferSchema = inferSchema, excerptSize = 3)
95 | .load(url.getPath)
96 | }
97 |
98 | describe("spark-excel") {
99 | it("should read errors in string format when setErrorCellsToFallbackValues=true and inferSchema=true") {
100 | val df = readFromResources(excelLocation, true, true)
101 | val expected = spark.createDataFrame(expectedDataErrorsAsStringsInfer, expectedSchemaInfer)
102 | assertDataFrameEquals(expected, df)
103 | }
104 |
105 | it("should read errors as null when setErrorCellsToFallbackValues=false and inferSchema=true") {
106 | val df = readFromResources(excelLocation, false, true)
107 | val expected = spark.createDataFrame(expectedDataErrorsAsNullInfer, expectedSchemaInfer)
108 | assertDataFrameEquals(expected, df)
109 | }
110 |
111 | it("should read errors in string format when setErrorCellsToFallbackValues=true and inferSchema=false") {
112 | val df = readFromResources(excelLocation, true, false)
113 | val expected = spark.createDataFrame(expectedDataErrorsAsStringsNonInfer, expectedSchemaNonInfer)
114 | assertDataFrameEquals(expected, df)
115 | }
116 |
117 | it("should read errors in string format when setErrorCellsToFallbackValues=false and inferSchema=false") {
118 | val df = readFromResources(excelLocation, false, false)
119 | val expected = spark.createDataFrame(expectedDataErrorsAsNullNonInfer, expectedSchemaNonInfer)
120 | assertDataFrameEquals(expected, df)
121 | }
122 | }
123 | }
124 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/RichRowSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel
18 |
19 | import org.apache.poi.ss.usermodel.{Cell, Row}
20 | import org.scalacheck.Gen
21 | import org.scalacheck.Prop.propBoolean
22 | import org.scalamock.scalatest.MockFactory
23 |
24 | import scala.util.Try
25 | import org.scalatestplus.scalacheck.ScalaCheckPropertyChecks
26 | import org.scalatest.funsuite.AnyFunSuite
27 |
28 | trait RowGenerator extends MockFactory {
29 | private val MAX_WIDTH = 100
30 |
31 | protected case class GeneratedRow(start: Int, end: Int, lastCellNum: Int, row: Row)
32 |
33 | protected val rowGen: Gen[GeneratedRow] = for {
34 | startColumn <- Gen.choose(0, MAX_WIDTH - 1)
35 | endColumn <- Gen.choose(0, MAX_WIDTH - 1)
36 | lastCellNum <- Gen.choose(0, MAX_WIDTH - 1)
37 | row = stub[Row]
38 | _ = (row.getCell(_: Int)).when(*) returns stub[Cell]
39 | _ = (row.getLastCellNum _).when() returns lastCellNum.toShort
40 | } yield GeneratedRow(startColumn, endColumn, lastCellNum, row)
41 | }
42 |
43 | class RichRowSuite extends AnyFunSuite with ScalaCheckPropertyChecks with RowGenerator {
44 | test("Invalid cell range should throw an error") {
45 | forAll(rowGen) { g =>
46 | (g.start > g.end) ==> Try {
47 | g.row.eachCellIterator(g.start, g.end).next()
48 | }.isFailure
49 | }
50 | }
51 |
52 | test("Valid cell range should iterate through all non-empty cells") {
53 | forAll(rowGen) { g =>
54 | (g.start <= g.end && g.start < g.lastCellNum) ==> {
55 | val count = g.row.eachCellIterator(g.start, g.end).size
56 | count === Math.min(g.end, g.lastCellNum - 1) - g.start + 1
57 | }
58 | }
59 | }
60 |
61 | test("Valid cell range should should not iterate through non-empty cells") {
62 | forAll(rowGen) { g =>
63 | (g.start <= g.end && g.start >= g.lastCellNum) ==> {
64 | g.row.eachCellIterator(g.start, g.end).size === 0
65 | }
66 | }
67 | }
68 | }
69 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/AreaReferenceReadSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import dev.mauch.spark.DataFrameSuiteBase
20 | import org.apache.spark.sql.Row
21 | import org.apache.spark.sql.types._
22 | import org.scalatest.funsuite.AnyFunSuite
23 |
24 | import java.util
25 | import scala.jdk.CollectionConverters._
26 |
27 | /** Loading data from different data addresses (AreaReference)
28 | */
29 | object AreaReferenceReadSuite {
30 | val expectedSchema_01 = StructType(
31 | List(
32 | StructField("Translations!$A$370", StringType, true),
33 | StructField("Translations!$A$371", LongType, true),
34 | StructField("Translations!$A$402", DoubleType, true),
35 | StructField("Translations!$A$393", DoubleType, true),
36 | StructField("Translations!$A$384", DoubleType, true),
37 | StructField("Translations!$A$405", DoubleType, true),
38 | StructField("Translations!$A$396", DoubleType, true),
39 | StructField("Translations!$A$387", DoubleType, true),
40 | StructField("Translations!$A$418", DoubleType, true),
41 | StructField("Translations!$A$419", DoubleType, true),
42 | StructField("Translations!$A$4110", DoubleType, true)
43 | )
44 | )
45 |
46 | /* Manually checking 1 row only */
47 | val expectedData_01: util.List[Row] = List(
48 | Row(
49 | "Alabama",
50 | 140895441L,
51 | 458d,
52 | 122d,
53 | 85116d,
54 | 1009700176.36684d,
55 | 268959435.626102d,
56 | 187645502645.503d,
57 | 0.0072d,
58 | 0.0019d,
59 | 1.3318d
60 | )
61 | ).asJava
62 |
63 | }
64 |
65 | class AreaReferenceReadSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities {
66 | import AreaReferenceReadSuite._
67 |
68 | test("AreaReference from diffrence sheet with testing data from Apache POI upstream tests") {
69 | val df = readFromResources(
70 | spark,
71 | path = "apache_poi/57231_MixedGasReport.xls",
72 | options = Map("dataAddress" -> "'Coefficient Table'!A6", "ignoreAfterHeader" -> 2, "inferSchema" -> true)
73 | ).limit(1)
74 | val expected = spark.createDataFrame(expectedData_01, expectedSchema_01)
75 | assertDataFrameApproximateEquals(expected, df, 0.1e-1)
76 | }
77 |
78 | }
79 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/DataFrameWriterApiComplianceSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import dev.mauch.spark.DataFrameSuiteBase
20 | import org.apache.spark.sql._
21 | import org.scalatest.wordspec.AnyWordSpec
22 |
23 | class DataFrameWriterApiComplianceSuite extends AnyWordSpec with DataFrameSuiteBase with LocalFileTestingUtilities {
24 |
25 | private def simpleDf = {
26 | val data = Seq(("foo", "bar", "1"), ("baz", "bang", "2"))
27 | spark.createDataFrame(data).toDF("col1", "col2", "col3")
28 | }
29 |
30 | /** Checks that the excel data files in given folder equal the provided dataframe */
31 | private def assertWrittenExcelData(expectedDf: DataFrame, folder: String): Unit = {
32 | val actualDf = spark.read
33 | .format("excel")
34 | .option("path", folder)
35 | .load()
36 |
37 | /* assertDataFrameNoOrderEquals is sensitive to order of columns, so we
38 | order both dataframes in the same way
39 | */
40 | val orderedSchemaColumns = expectedDf.schema.fields.map(f => f.name).sorted
41 |
42 | assertDataFrameNoOrderEquals(
43 | expectedDf.select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*),
44 | actualDf.select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*)
45 | )
46 |
47 | }
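  // Illustrative sketch (assumption, not part of the original suite): the write path exercised below pairs
  // the "excel" format with standard DataFrameWriter features; the target directory is a placeholder:
  //
  //   simpleDf.write
  //     .partitionBy("col1")
  //     .format("excel")
  //     .option("header", "true")
  //     .mode(SaveMode.Overwrite)
  //     .save("/tmp/excel-out")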
48 | "excel v2 complies to DataFrameWriter SaveMode and Partitioning behavior" can {
49 |
50 | val writeModes = Seq(SaveMode.Overwrite, SaveMode.Append)
51 |
52 | for (writeMode <- writeModes) {
53 | s"write a dataframe to xlsx with ${writeMode.toString}" in withExistingCleanTempDir("v2") { targetDir =>
54 | // create a simple dataframe and write it as xlsx
55 | val df = simpleDf
56 |
57 | df.write
58 | .format("excel")
59 | .option("path", targetDir)
60 | .option("header", value = true)
61 | .mode(writeMode)
62 | .save()
63 |
64 | val listOfFiles = getListOfFilesFilteredByExtension(targetDir, "xlsx")
65 | assert(listOfFiles.nonEmpty, s"expected at least one excel file")
66 |
67 | // is the result really the same?
68 | assertWrittenExcelData(df, targetDir)
69 |
70 | }
71 | s"write a dataframe to xlsx with ${writeMode.toString} (partitioned)" in withExistingCleanTempDir("v2") {
72 | targetDir =>
73 | assume(spark.sparkContext.version >= "3.0.1")
74 | // create a simple dataframe and write it as xlsx
75 | val df = simpleDf
76 |
77 | df.write
78 | .partitionBy("col1")
79 | .format("excel")
80 | .option("path", targetDir)
81 | .option("header", value = true)
82 | .mode(writeMode)
83 | .save()
84 |
85 | // some file based checks
86 | val listOfFolders = getListOfFolders(targetDir)
87 | assert(listOfFolders.length == 2, s"expected two folders because there are two partitions")
88 | for (folder <- listOfFolders) {
89 | assert(folder.getName.startsWith("col1="), s"expected partition folders and those must start with col1=")
90 | val listOfFiles = getListOfFilesFilteredByExtension(folder.getAbsolutePath, "xlsx")
91 | assert(listOfFiles.nonEmpty, s"expected at least one xlsx per folder but got $listOfFiles")
92 | }
93 |
94 | // is the result really the same?
95 | assertWrittenExcelData(df, targetDir)
96 |
97 | }
98 | }
99 |
100 | for (isPartitioned <- Seq(false, true)) {
101 | s"multiple appends to folder (partitioned == $isPartitioned)" in withExistingCleanTempDir("v2") { targetDir =>
102 | if (isPartitioned) {
103 | assume(spark.sparkContext.version >= "3.0.1")
104 | }
105 |
106 | val df = simpleDf
107 |
108 | val dfWriter = if (isPartitioned) df.write.partitionBy("col1") else df.write
109 |
110 | dfWriter
111 | .format("excel")
112 | .option("path", targetDir)
113 | .option("header", value = true)
114 | .mode(SaveMode.Append)
115 | .save()
116 | dfWriter
117 | .format("excel")
118 | .option("path", targetDir)
119 | .option("header", value = true)
120 | .mode(SaveMode.Append)
121 | .save()
122 |
123 | val orderedSchemaColumns = df.schema.fields.map(f => f.name).sorted
124 | val expectedDf =
125 | df.union(df).select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*)
126 |
127 | assertWrittenExcelData(expectedDf, targetDir)
128 | }
129 | }
130 | }
131 | }
132 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/EncryptedReadSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import dev.mauch.spark.DataFrameSuiteBase
20 | import org.apache.spark.sql._
21 | import org.apache.spark.sql.types._
22 | import org.scalatest.funsuite.AnyFunSuite
23 |
24 | import scala.jdk.CollectionConverters._
25 |
26 | object EncryptedReadSuite {
27 | val simpleSchema = StructType(
28 | List(
29 | StructField("A", IntegerType, true),
30 | StructField("B", IntegerType, true),
31 | StructField("C", IntegerType, true),
32 | StructField("D", IntegerType, true)
33 | )
34 | )
35 |
36 | val expectedData = List(Row(1, 2, 3, 4)).asJava
37 | }
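// Illustrative sketch (not part of the original suite): workbookPassword decrypts protected workbooks and
// maxRowsInMemory switches to the streaming reader; the path and password here are placeholders:
//
//   val df = spark.read
//     .format("excel")
//     .option("workbookPassword", "secret")
//     .option("maxRowsInMemory", "10")
//     .option("inferSchema", "true")
//     .load("/path/to/encrypted.xlsx")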
38 |
39 | class EncryptedReadSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities {
40 | import EncryptedReadSuite._
41 |
42 | lazy val expected = spark.createDataFrame(expectedData, simpleSchema)
43 |
44 | test("read encrypted xslx file") {
45 | val df = readFromResources(
46 | spark,
47 | path = "simple_encrypted.xlsx",
48 | options = Map(
49 | "dataAddress" -> "Sheet1!A1",
50 | "treatEmptyValuesAsNulls" -> true,
51 | "workbookPassword" -> "fooba",
52 | "inferSchema" -> true
53 | )
54 | )
55 | assertDataFrameEquals(expected, df)
56 | }
57 |
58 | test("read encrypted xslx file (maxRowsInMemory)") {
59 | val df = readFromResources(
60 | spark,
61 | path = "simple_encrypted.xlsx",
62 | options = Map(
63 | "dataAddress" -> "Sheet1!A1",
64 | "treatEmptyValuesAsNulls" -> true,
65 | "workbookPassword" -> "fooba",
66 | "maxRowsInMemory" -> 1,
67 | "inferSchema" -> true
68 | )
69 | )
70 | assertDataFrameEquals(expected, df)
71 | }
72 |
73 | test("read encrypted xls file") {
74 | val df = readFromResources(
75 | spark,
76 | path = "simple_encrypted.xls",
77 | options = Map(
78 | "dataAddress" -> "Sheet1!A1",
79 | "treatEmptyValuesAsNulls" -> true,
80 | "workbookPassword" -> "fooba",
81 | "inferSchema" -> true
82 | )
83 | )
84 | assertDataFrameEquals(expected, df)
85 | }
86 | }
87 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/ErrorsAsStringsReadSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import dev.mauch.spark.DataFrameSuiteBase
20 | import org.apache.spark.sql.Row
21 | import org.apache.spark.sql.types._
22 | import org.scalatest.funsuite.AnyFunSuite
23 |
24 | import java.sql.Timestamp
25 | import java.time.LocalDateTime
26 | import java.util
27 | import scala.jdk.CollectionConverters._
28 |
29 | object ErrorsAsStringsReadSuite {
30 | private val dummyTimestamp = Timestamp.valueOf(LocalDateTime.of(2021, 2, 19, 0, 0))
31 | private val dummyText = "hello"
32 |
33 | private val expectedSchemaInfer = StructType(
34 | List(
35 | StructField("double", IntegerType, true),
36 | StructField("boolean", BooleanType, true),
37 | StructField("timestamp", TimestampType, true),
38 | StructField("string", StringType, true),
39 | StructField("formula", StringType, true)
40 | )
41 | )
42 |
43 | private val expectedDataErrorsAsNullInfer: util.List[Row] = List(
44 | Row(1, true, dummyTimestamp, dummyText, "A1"),
45 | Row(2, false, dummyTimestamp, dummyText, "A3"),
46 | Row(null, null, null, null, null),
47 | Row(null, null, null, null, null)
48 | ).asJava
49 |
50 | private val expectedDataErrorsAsStringsInfer: util.List[Row] = List(
51 | Row(1, true, dummyTimestamp, dummyText, "A1"),
52 | Row(2, false, dummyTimestamp, dummyText, "A3"),
53 | Row(null, null, null, "#NULL!", "#DIV/0!"),
54 | Row(null, null, null, "#N/A", "#NAME?")
55 | ).asJava
56 |
57 | private val expectedSchemaNonInfer = StructType(
58 | List(
59 | StructField("double", StringType, true),
60 | StructField("boolean", StringType, true),
61 | StructField("timestamp", StringType, true),
62 | StructField("string", StringType, true),
63 | StructField("formula", StringType, true)
64 | )
65 | )
66 |
67 | private val expectedDataErrorsAsNullNonInfer: util.List[Row] = List(
68 | Row("1", "TRUE", """19"-"Feb"-"2021""", "hello", "A1"),
69 | Row("2", "FALSE", """19"-"Feb"-"2021""", "hello", "A3"),
70 | Row(null, null, null, null, null),
71 | Row(null, null, null, null, null)
72 | ).asJava
73 |
74 | private val expectedDataErrorsAsStringsNonInfer: util.List[Row] = List(
75 | Row("1", "TRUE", """19"-"Feb"-"2021""", dummyText, "A1"),
76 | Row("2", "FALSE", """19"-"Feb"-"2021""", dummyText, "A3"),
77 | Row("#NULL!", "#NULL!", "#NULL!", "#NULL!", "#DIV/0!"),
78 | Row("#N/A", "#N/A", "#N/A", "#N/A", "#NAME?")
79 | ).asJava
80 | }
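// Illustrative sketch (not part of the original suite): the useNullForErrorCells option tested below can be
// set directly on a reader; the file path here is a placeholder:
//
//   val df = spark.read
//     .format("excel")
//     .option("inferSchema", "true")
//     .option("useNullForErrorCells", "true")
//     .load("/path/to/with_errors.xlsx")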
81 |
82 | /** Breaking change compared to V1: for Spark StringType fields, an error cell can either keep its error value or
83 | * become null, just like any other Spark type.
84 | *
85 | * Related issue: Support ERROR cell type when using inferSchema=true, link:
86 | * https://github.dev/mauch/spark-excel/pull/343
87 | */
88 | class ErrorsAsStringsReadSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities {
89 | import ErrorsAsStringsReadSuite._
90 |
91 | test("error cells as null when useNullForErrorCells=true and inferSchema=true") {
92 | val df = readFromResources(
93 | spark,
94 | path = "with_errors_all_types.xlsx",
95 | options = Map("inferSchema" -> true, "useNullForErrorCells" -> true)
96 | )
97 | val expected = spark.createDataFrame(expectedDataErrorsAsNullInfer, expectedSchemaInfer)
98 | assertDataFrameEquals(expected, df)
99 | }
100 |
101 | test("errors as null for non-string type with useNullForErrorCells=false and inferSchema=true") {
102 | val df = readFromResources(
103 | spark,
104 | path = "with_errors_all_types.xlsx",
105 | options = Map("inferSchema" -> true, "useNullForErrorCells" -> false)
106 | )
107 | val expected = spark.createDataFrame(expectedDataErrorsAsStringsInfer, expectedSchemaInfer)
108 | assertDataFrameEquals(expected, df)
109 | }
110 |
111 | test("errors in string format when useNullForErrorCells=true and inferSchema=false") {
112 | val df = readFromResources(
113 | spark,
114 | path = "with_errors_all_types.xlsx",
115 | options = Map("inferSchema" -> false, "useNullForErrorCells" -> true)
116 | )
117 | val expected = spark.createDataFrame(expectedDataErrorsAsNullNonInfer, expectedSchemaNonInfer)
118 | assertDataFrameEquals(expected, df)
119 | }
120 |
121 | test("errors in string format when useNullForErrorCells=false and inferSchema=false") {
122 | val df = readFromResources(
123 | spark,
124 | path = "with_errors_all_types.xlsx",
125 | options = Map("inferSchema" -> false, "useNullForErrorCells" -> false)
126 | )
127 | val expected = spark
128 | .createDataFrame(expectedDataErrorsAsStringsNonInfer, expectedSchemaNonInfer)
129 | assertDataFrameEquals(expected, df)
130 | }
131 |
132 | }
133 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/ExcelTestingUtilities.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import org.apache.spark.sql._
20 | import org.apache.spark.sql.types.StructType
21 | import scala.reflect.io.Directory
22 | import java.io.File
23 |
24 | trait ExcelTestingUtilities {
25 |
26 | private val dataRoot = getClass.getResource("/spreadsheets").getPath
27 |
28 | /** Load excel data from resource folder
29 | *
30 | * @param spark
31 | * spark session
32 | * @param path
33 | * relative path to the resource/spreadsheets
34 | * @param options
35 | * extra loading option
36 | * @return
37 | * data frame
38 | */
39 | def readFromResources(spark: SparkSession, path: String, options: Map[String, Any]): DataFrame =
40 | spark.read
41 | .format("excel")
42 | .options(options.map(p => (p._1 -> p._2.toString())))
43 | .load(s"$dataRoot/$path")
44 |
45 | /** Load excel data from resource folder with user defined schema
46 | *
47 | * @param spark
48 | * spark session
49 | * @param path
50 | * relative path to the resource/spreadsheets
51 | * @param options
52 | * extra loading option
53 | * @param schema
54 | * user provided schema
55 | * @return
56 | * data frame
57 | */
58 | def readFromResources(spark: SparkSession, path: String, options: Map[String, Any], schema: StructType): DataFrame =
59 | spark.read
60 | .format("excel")
61 | .options(options.map(p => (p._1 -> p._2.toString())))
62 | .schema(schema)
63 | .load(s"$dataRoot/$path")
64 |
65 | /** Delete directory recursively. Intended for temporary testing data only. Use with caution!
66 | *
67 | * @param path
68 | * to be deleted
69 | */
70 | def deleteDirectory(path: String): Unit = {
71 | val directory = new Directory(new File(path))
72 | directory.deleteRecursively()
73 | ()
74 | }
75 | }
76 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/GlobPartitionAndFileNameSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import dev.mauch.spark.DataFrameSuiteBase
20 | import org.apache.spark.sql.functions.input_file_name
21 | import org.apache.spark.sql.types._
22 | import org.scalatest.funsuite.AnyFunSuite
23 |
24 | /** Issue References:
25 | *
26 | * #52. input_file_name returns empty string https://github.dev/mauch/spark-excel/issues/52
27 | *
28 | * #74. Allow reading multiple files specified as a list OR by a pattern https://github.dev/mauch/spark-excel/issues/74
29 | *
30 | * #97. Reading multiple files https://github.dev/mauch/spark-excel/issues/97
31 | */
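// Illustrative sketch (not part of the original suite): glob patterns and partition discovery shown in the
// tests below can be combined with input_file_name(); the path here is a placeholder:
//
//   val df = spark.read
//     .format("excel")
//     .option("header", "true")
//     .load("/data/ca_dataset/2019/Quarter=4/*.xlsx")
//     .withColumn("file_name", input_file_name())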
32 |
33 | object GlobPartitionAndFileNameSuite {
34 | val expectedInferredSchema = StructType(
35 | List(
36 | StructField("Day", IntegerType, true),
37 | StructField("Month", IntegerType, true),
38 | StructField("Customer ID", StringType, true),
39 | StructField("Customer Name", StringType, true),
40 | StructField("Standard Package", IntegerType, true),
41 | StructField("Extra Option 1", IntegerType, true),
42 | StructField("Extra Option 2", IntegerType, true),
43 | StructField("Extra Option 3", IntegerType, true),
44 | StructField("Staff", StringType, true)
45 | )
46 | )
47 |
48 | val expectedWithFilenameSchema = StructType(
49 | List(
50 | StructField("Day", IntegerType, true),
51 | StructField("Month", IntegerType, true),
52 | StructField("Customer ID", StringType, true),
53 | StructField("Customer Name", StringType, true),
54 | StructField("Standard Package", IntegerType, true),
55 | StructField("Extra Option 1", IntegerType, true),
56 | StructField("Extra Option 2", IntegerType, true),
57 | StructField("Extra Option 3", IntegerType, true),
58 | StructField("Staff", StringType, true),
59 | StructField("file_name", StringType, false)
60 | )
61 | )
62 |
63 | val expectedWithPartitionSchema = StructType(
64 | List(
65 | StructField("Day", IntegerType, true),
66 | StructField("Month", IntegerType, true),
67 | StructField("Customer ID", StringType, true),
68 | StructField("Customer Name", StringType, true),
69 | StructField("Standard Package", IntegerType, true),
70 | StructField("Extra Option 1", IntegerType, true),
71 | StructField("Extra Option 2", IntegerType, true),
72 | StructField("Extra Option 3", IntegerType, true),
73 | StructField("Staff", StringType, true),
74 | StructField("Quarter", IntegerType, true)
75 | )
76 | )
77 | }
78 |
79 | class GlobPartitionAndFileNameSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities {
80 | import GlobPartitionAndFileNameSuite._
81 |
82 | private val sharedOptions = Map("header" -> true, "inferSchema" -> true)
83 |
84 | test("read multiple files must infer correct schema with inferSchema=true") {
85 | val df = readFromResources(spark, "ca_dataset/2019/Quarter=4/*.xlsx", sharedOptions)
86 | assert(df.schema == expectedInferredSchema)
87 | }
88 |
89 | test("read multiple files with input_file_name") {
90 | val df = readFromResources(spark, "ca_dataset/2019/Quarter=4/*.xlsx", sharedOptions)
91 | .withColumn("file_name", input_file_name())
92 | assert(df.schema == expectedWithFilenameSchema)
93 |
94 | /* And validate list of filename */
95 | val names = df
96 | .select("file_name")
97 | .distinct()
98 | .collect()
99 | .map(r => r.getString(0))
100 | .map(p => p.split("[\\/]").last) // this works on Windows too
101 | .toSet
102 | assert(names == Set[String]("ca_10.xlsx", "ca_11.xlsx", "ca_12.xlsx"))
103 | }
104 |
105 | test("read whole folder with partition") {
106 | val df = readFromResources(spark, "ca_dataset/2019", sharedOptions)
107 | assert(df.schema == expectedWithPartitionSchema)
108 |
109 | /* And validate list of Quarters */
110 | val quarters = df.select("Quarter").distinct().collect().map(r => r.getInt(0)).toSet
111 | assert(quarters == Set[Int](1, 2, 3, 4))
112 | }
113 |
114 | test("read multiple files must has same number total number of rows") {
115 | val q4_total = readFromResources(spark, "ca_dataset/2019/Quarter=4/*.xlsx", sharedOptions)
116 | .count()
117 |
118 | val q4_sum = Seq("ca_10.xlsx", "ca_11.xlsx", "ca_12.xlsx")
119 | .map(name => readFromResources(spark, s"ca_dataset/2019/Quarter=4/$name", sharedOptions).count())
120 | .sum
121 |
122 | assert(q4_total > 0)
123 | assert(q4_total == q4_sum)
124 | }
125 |
126 | }
127 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/InferStricterNumericalTypesSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import dev.mauch.spark.DataFrameSuiteBase
20 | import org.apache.spark.sql.Row
21 | import org.apache.spark.sql.types._
22 | import org.scalatest.funsuite.AnyFunSuite
23 |
24 | import java.util
25 | import scala.jdk.CollectionConverters._
26 |
27 | object InferStricterNumericalTypesSuite {
28 | val expectedInferredSchema = StructType(
29 | List(
30 | StructField("ID", StringType, true),
31 | StructField("Integer Value Range", IntegerType, true),
32 | StructField("Long Value Range", LongType, true),
33 | StructField("Double Value Range", DoubleType, true)
34 | )
35 | )
36 |
37 | /** Stricter types for numerical values
38 | */
39 | val expectedDataInferSchema: util.List[Row] = List(
40 | Row("Gas & Oil", 2147482967, 92147483647L, 90315085.71d),
41 | Row("Telecomunication", 2147483099, 102147483647L, -965079398.74d),
42 | Row("Manufacturing", 2147482826, 112147483647L, -353020871.56d),
43 | Row("Farming", 2147482838, -102147483647L, -446026564.15d),
44 | Row("Service", 2147483356, -112147483647L, -820766945.73d)
45 | ).asJava
46 | }
47 |
48 | class InferStricterNumericalTypesSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities {
49 | import InferStricterNumericalTypesSuite._
50 |
51 | test("stricter numerical types usePlainNumberFormat=true and inferSchema=true (xlxs)") {
52 | val df = readFromResources(
53 | spark,
54 | path = "infer_stricter_numerical_types.xlsx",
55 | options = Map("usePlainNumberFormat" -> true, "inferSchema" -> true)
56 | )
57 | val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema)
58 | assertDataFrameEquals(expected, df)
59 | }
60 |
61 | test("stricter numerical types usePlainNumberFormat=false and inferSchema=true (xlxs)") {
62 | val df = readFromResources(
63 | spark,
64 | path = "infer_stricter_numerical_types.xlsx",
65 | options = Map("usePlainNumberFormat" -> false, "inferSchema" -> true)
66 | )
67 | val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema)
68 | assertDataFrameEquals(expected, df)
69 | }
70 |
71 | test("stricter numerical types usePlainNumberFormat=true and inferSchema=true (xls)") {
72 | val df = readFromResources(
73 | spark,
74 | path = "infer_stricter_numerical_types.xls",
75 | options = Map("usePlainNumberFormat" -> true, "inferSchema" -> true)
76 | )
77 | val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema)
78 | assertDataFrameEquals(expected, df)
79 | }
80 |
81 | test("stricter numerical types usePlainNumberFormat=false and inferSchema=true (xls)") {
82 | val df = readFromResources(
83 | spark,
84 | path = "infer_stricter_numerical_types.xls",
85 | options = Map("usePlainNumberFormat" -> false, "inferSchema" -> true)
86 | )
87 | val expected = spark.createDataFrame(expectedDataInferSchema, expectedInferredSchema)
88 | assertDataFrameEquals(expected, df)
89 | }
90 | }
91 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/KeepUndefinedRowsSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import dev.mauch.spark.DataFrameSuiteBase
20 | import org.apache.spark.sql.Row
21 | import org.apache.spark.sql.types._
22 | import org.scalatest.funsuite.AnyFunSuite
23 |
24 | import java.util
25 | import scala.jdk.CollectionConverters._
26 |
27 | object KeepUndefinedRowsSuite {
28 |
29 | /* Issue: https://github.dev/mauch/spark-excel/issues/285 */
30 | val expectedSchema_Issue285 = StructType(
31 | List(StructField("1", StringType, true), StructField("2", StringType, true), StructField("3", StringType, true))
32 | )
33 |
34 | /** No change needed in spark-excel; Apache POI also produces the same result with sheet.iterator
35 | *
36 | * Workaround: https://stackoverflow.com/questions/47790569/how-to-avoid-skipping-blank-rows-or-columns-in-apache-poi
37 | * Doc: http://poi.apache.org/components/spreadsheet/quick-guide.html#Iterator
38 | */
39 | val expectedData_Issue285: util.List[Row] = List(
40 | Row("File info", null, null),
41 | Row("Info", "Info", "Info"),
42 | Row("Metadata", null, null),
43 | Row(null, "1", "2"),
44 | Row("A", "1", "2"),
45 | Row("B", "5", "6"),
46 | Row("C", "9", "10"),
47 | Row("Metadata", null, null),
48 | Row(null, "1", "2"),
49 | Row("A", "1", "2"),
50 | Row("B", "4", "5"),
51 | Row("C", "7", "8")
52 | ).asJava
53 |
54 | /* With newly introduced keepUndefinedRows option */
55 | val expectedData_KeepUndefinedRows_Issue285: util.List[Row] = List(
56 | Row("File info", null, null),
57 | Row("Info", "Info", "Info"),
58 | Row(null, null, null),
59 | Row("Metadata", null, null),
60 | Row(null, null, null),
61 | Row(null, "1", "2"),
62 | Row("A", "1", "2"),
63 | Row("B", "5", "6"),
64 | Row("C", "9", "10"),
65 | Row(null, null, null),
66 | Row(null, null, null),
67 | Row("Metadata", null, null),
68 | Row(null, null, null),
69 | Row(null, "1", "2"),
70 | Row("A", "1", "2"),
71 | Row("B", "4", "5"),
72 | Row("C", "7", "8")
73 | ).asJava
74 |
75 | /** Issue: https://github.dev/mauch/spark-excel/issues/162 Spark-excel still infers DoubleType; however, the user can
76 | * provide a custom schema and spark-excel should load IntegerType or LongType accordingly
77 | */
78 | val userDefined_Issue162 = StructType(
79 | List(
80 | StructField("ID", IntegerType, true),
81 | StructField("address", StringType, true),
82 | StructField("Pin", IntegerType, true)
83 | )
84 | )
85 |
86 | val expectedData_Issue162: util.List[Row] =
87 | List(Row(123123, "Asdadsas, Xyxyxy, 123xyz", 123132), Row(123124, "Asdadsas1, Xyxyxy, 123xyz", 123133)).asJava
88 |
89 | }
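// Illustrative sketch (not part of the original suite): keepUndefinedRows controls whether rows that are
// absent from the sheet XML are emitted as all-null rows; option names match the tests below, the path is
// a placeholder:
//
//   val df = spark.read
//     .format("excel")
//     .option("header", "false")
//     .option("keepUndefinedRows", "true")
//     .load("/path/to/sparse_rows.xlsx")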
90 |
91 | class KeepUndefinedRowsSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities {
92 | import KeepUndefinedRowsSuite._
93 |
94 | test("#285 undefined rows: no keep") {
95 | val df = readFromResources(
96 | spark,
97 | path = "issue_285_bryce21.xlsx",
98 | options = Map("header" -> false, "inferSchema" -> false, "keepUndefinedRows" -> false),
99 | schema = expectedSchema_Issue285
100 | )
101 | val expected = spark.createDataFrame(expectedData_Issue285, expectedSchema_Issue285)
102 | assertDataFrameEquals(expected, df)
103 | }
104 |
105 | test("#162 load integer values with user defined schema") {
106 | val df = readFromResources(
107 | spark,
108 | path = "issue_162_nihar_gharat.xlsx",
109 | options = Map("header" -> true),
110 | schema = userDefined_Issue162
111 | )
112 | val expected = spark.createDataFrame(expectedData_Issue162, userDefined_Issue162)
113 | assertDataFrameEquals(expected, df)
114 | }
115 |
116 | for (sheetName <- Seq("blank_row", "space_row")) {
117 | test(s"#965 handling of NULL/BLANK column values (streamingReader, keepUndefinedRows==false, sheet=$sheetName)") {
118 | val df = readFromResources(
119 | spark,
120 | path = "issue_965_blank_rows.xlsx",
121 | options = Map(
122 | "dataAddress" -> s"'${sheetName}'!A1",
123 | "inferSchema" -> true,
124 | "header" -> true,
125 | "maxRowsInMemory" -> "1000",
126 | "keepUndefinedRows" -> false
127 | )
128 | )
129 | assert(df.schema.fields.length == 5) // each sheet has 5 columns
130 | /*
131 | sheet "blank_row" has row 2 and 4 defined, while row 3 is not defined in excel xml and row 5 contains empty cells in excel xml
132 | => 2 rows in total (prior the fix row 5 was added as well)
133 | sheet "space_row" has row 2 and 4 defined with some values, row 3 contains just a whitespace in A3
134 | => 3 rows in total (just to test that a single whitespace is handled correctly)
135 | */
136 | if (sheetName == "blank_row") {
137 | assert(df.count() == 2)
138 | } else {
139 | assert(df.count() == 3)
140 | }
141 | }
142 | }
143 |
144 | for (keepUndefinedRows <- Seq(false, true)) {
145 | test(s"#965 handling of NULL/BLANK column values (NON-streaming-Reader, keepUndefinedRows==$keepUndefinedRows)") {
146 | val df = readFromResources(
147 | spark,
148 | path = "issue_965_blank_rows.xlsx",
149 | options = Map(
150 | "dataAddress" -> s"'blank_row'!A1",
151 | "inferSchema" -> true,
152 | "header" -> true,
153 | "keepUndefinedRows" -> keepUndefinedRows
154 | )
155 | )
156 | assert(df.schema.fields.length == 5) // the sheet has 5 columns
157 | /*
158 | sheet "blank_row" has row 2 and 4 defined, while row 3 is not defined in excel xml and row 5 contains empty cells in excel xml
159 | * keepUndefinedRows == true => 4 rows in total
160 | * keepUndefinedRows == false => 2 rows in total
161 | */
162 | if (keepUndefinedRows) {
163 | assert(df.count() == 4)
164 | } else {
165 | assert(df.count() == 2)
166 | }
167 | }
168 | }
169 |
170 | }
171 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/LocalFileTestingUtilities.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import java.io.File
20 | import java.nio.file.Files
21 |
22 | trait LocalFileTestingUtilities {
23 |
24 | /** Returns the list of files in given directory/folder (this is not recursive)
25 | */
26 | def getListOfFiles(folder: String): List[File] = {
27 | val d = new File(folder)
28 | if (d.exists && d.isDirectory) {
29 | d.listFiles.filter(_.isFile).toList
30 | } else {
31 | List[File]()
32 | }
33 | }
34 |
35 | /** similar to getListOfFiles but filters the files by the given file extension */
36 | def getListOfFilesFilteredByExtension(targetDir: String, filteredByExtension: String): Seq[String] = {
37 | val filesInTargetDir = getListOfFiles(targetDir)
38 | filesInTargetDir.filter(_.getName.endsWith(filteredByExtension)).map(_.getName)
39 | }
40 |
41 | /** Returns the list of folders in given directory/folder (this is not recursive) */
42 | def getListOfFolders(folder: String): List[File] = {
43 | val d = new File(folder)
44 | if (d.exists && d.isDirectory) {
45 | d.listFiles.filter(_.isDirectory).toList
46 | } else {
47 | List[File]()
48 | }
49 | }
50 |
51 | /** Deletes the (non-empty) directory (recursively)
52 | */
53 | def deleteDirectoryRecursively(folderToDelete: File): Unit = {
54 | val allContents = folderToDelete.listFiles
55 | if (allContents != null) for (file <- allContents) {
56 | deleteDirectoryRecursively(file)
57 | }
58 | folderToDelete.delete
59 | ()
60 | }
61 |
62 | /** fixture that creates a temporary folder and deletes it after test completion */
63 | def withExistingCleanTempDir(name: String): (String => Unit) => Unit = {
64 |
65 | def fixture(testCode: String => Unit): Unit = {
66 |
67 | val directory = Files.createTempDirectory(name)
68 |
69 | try testCode(directory.toString)
70 | finally deleteDirectoryRecursively(directory.toFile)
71 | }
72 |
73 | fixture
74 | }
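  // A minimal usage sketch (assumption, not part of the original trait): the fixture passes the path of a
  // fresh temporary folder to the test body and removes it afterwards; `df` here is a hypothetical DataFrame:
  //
  //   withExistingCleanTempDir("v2") { targetDir =>
  //     df.write.format("excel").option("path", targetDir).save()
  //   }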
75 |
76 | }
77 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/ManyPartitionReadSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import dev.mauch.spark.DataFrameSuiteBase
20 | import org.apache.spark.sql._
21 | import org.apache.spark.sql.functions.col
22 | import org.apache.spark.sql.types.IntegerType
23 | import org.scalatest.wordspec.AnyWordSpec
24 | import org.apache.spark.sql.types.{StringType, StructField, StructType}
25 |
26 | class ManyPartitionReadSuite extends AnyWordSpec with DataFrameSuiteBase with LocalFileTestingUtilities {
27 |
28 | /** Checks that the excel data files in given folder equal the provided dataframe */
29 | private def assertWrittenExcelData(expectedDf: DataFrame, folder: String): Unit = {
30 | val actualDf = spark.read
31 | .format("excel")
32 | .option("path", folder)
33 | .load()
34 |
35 | /* assertDataFrameNoOrderEquals is sensitive to order of columns, so we
36 | order both dataframes in the same way
37 | */
38 | val orderedSchemaColumns = expectedDf.schema.fields.map(f => f.name).sorted
39 |
40 | assertDataFrameNoOrderEquals(
41 | expectedDf.select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*),
42 | actualDf.select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*)
43 | )
44 |
45 | }
46 |
47 | def createExpected(targetDir: String): DataFrame = {
48 |
49 | // Generate data programmatically
50 | val data = (1 to 19).flatMap { col1 =>
51 | // Each col1 value has multiple rows (8, 16, or 11 depending on col1)
52 | val rowsPerPartition = if (col1 == 1) 8 else if (col1 == 2) 16 else 11
53 | (0 until rowsPerPartition).map { i =>
54 | val index = (col1 - 1) * 11 + i + 1234 // Starting from 1234 as in original data
55 | Row(
56 | Integer.valueOf(col1), // Make it nullable Integer
57 | s"fubar_$index",
58 | s"bazbang_${index + 77000}",
59 | s"barfang_${index + 237708}",
60 | s"lorem_ipsum_$index"
61 | )
62 | }
63 | }
64 |
65 | // Define schema explicitly to match expected nullability
66 | val schema = StructType(
67 | Array(
68 | StructField("col1", IntegerType, nullable = true),
69 | StructField("col2", StringType, nullable = true),
70 | StructField("col3", StringType, nullable = true),
71 | StructField("col4", StringType, nullable = true),
72 | StructField("col5", StringType, nullable = true)
73 | )
74 | )
75 |
76 | val dfInput = spark.createDataFrame(spark.sparkContext.parallelize(data), schema)
77 |
78 | val dfFinal = dfInput.union(dfInput)
79 |
80 | val dfWriter = dfFinal.write
81 | .partitionBy("col1")
82 | .format("excel")
83 | .option("path", targetDir)
84 | .option("header", value = true)
85 | .mode(SaveMode.Append)
86 |
87 | dfWriter.save()
88 | dfWriter.save()
89 |
90 | val orderedSchemaColumns = dfInput.schema.fields.map(f => f.name).sorted
91 |
92 | dfFinal
93 | .union(dfFinal)
94 | .withColumn("col1", col("col1").cast(IntegerType))
95 | .select(orderedSchemaColumns.head, orderedSchemaColumns.tail.toIndexedSeq: _*)
96 |
97 | }
98 |
99 | for (run <- Range(0, 3)) {
100 |
101 | s"many partitions read (run=$run)" in withExistingCleanTempDir("v2") { targetDir =>
102 | assume(spark.sparkContext.version >= "3.0.1")
103 | val expectedDf = createExpected(targetDir)
104 | assertWrittenExcelData(expectedDf, targetDir)
105 | }
106 | }
107 |
108 | }
109 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/NumericTypesSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import dev.mauch.spark.DataFrameSuiteBase
20 | import org.apache.spark.sql.Row
21 | import org.apache.spark.sql.types._
22 | import org.scalatest.funsuite.AnyFunSuite
23 |
24 | import java.util
25 | import scala.jdk.CollectionConverters._
26 |
27 | /** For schema inferring as well as loading of various numeric types {Integer, Long, Double}
28 | */
29 | object NumericTypesSuite {
30 |
31 | val userDefinedSchema_01 = StructType(
32 | List(
33 | StructField("Day", IntegerType, true),
34 | StructField("Month", IntegerType, true),
35 | StructField("Customer ID", StringType, true),
36 | StructField("Customer Name", StringType, true),
37 | StructField("Standard Package", IntegerType, true),
38 | StructField("Extra Option 1", IntegerType, true),
39 | StructField("Extra Option 2", IntegerType, true),
40 | StructField("Extra Option 3", IntegerType, true),
41 | StructField("Staff", StringType, true)
42 | )
43 | )
44 |
45 | val expectedData_01: util.List[Row] = List(
46 | Row(1, 12, "CA869", "Phạm Uyển Trinh", null, null, 2200, null, "Ella Fitzgerald"),
47 | Row(1, 12, "CA870", "Nguyễn Liên Thảo", null, null, 2000, 1350, "Ella Fitzgerald"),
48 | Row(1, 12, "CA871", "Lê Thị Nga", 17000, null, null, null, "Ella Fitzgerald"),
49 | Row(1, 12, "CA872", "Phan Tố Nga", null, null, 2000, null, "Teresa Teng"),
50 | Row(1, 12, "CA873", "Nguyễn Thị Teresa Teng", null, null, 1200, null, "Jesse Thomas")
51 | ).asJava
52 |
53 | val userDefinedSchema_02 = StructType(
54 | List(
55 | StructField("Day", LongType, true),
56 | StructField("Month", LongType, true),
57 | StructField("Customer ID", StringType, true),
58 | StructField("Customer Name", StringType, true),
59 | StructField("Standard Package", IntegerType, true),
60 | StructField("Extra Option 1", IntegerType, true),
61 | StructField("Extra Option 2", IntegerType, true),
62 | StructField("Extra Option 3", LongType, true),
63 | StructField("Staff", StringType, true)
64 | )
65 | )
66 |
67 | val expectedData_02: util.List[Row] = List(
68 | Row(1L, 12L, "CA869", "Phạm Uyển Trinh", null, null, 2200, null, "Ella Fitzgerald"),
69 | Row(1L, 12L, "CA870", "Nguyễn Liên Thảo", null, null, 2000, 1350L, "Ella Fitzgerald"),
70 | Row(1L, 12L, "CA871", "Lê Thị Nga", 17000, null, null, null, "Ella Fitzgerald"),
71 | Row(1L, 12L, "CA872", "Phan Tố Nga", null, null, 2000, null, "Teresa Teng"),
72 | Row(1L, 12L, "CA873", "Nguyễn Thị Teresa Teng", null, null, 1200, null, "Jesse Thomas")
73 | ).asJava
74 | }
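// Illustrative sketch (not part of the original suite): instead of inferSchema, a user-defined schema can be
// supplied so that numeric cells are loaded with the requested Integer/Long types; the path is a placeholder:
//
//   val df = spark.read
//     .format("excel")
//     .option("header", "true")
//     .schema(NumericTypesSuite.userDefinedSchema_02)
//     .load("/path/to/ca_12.xlsx")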
75 |
76 | class NumericTypesSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities {
77 | import NumericTypesSuite._
78 |
79 | test("load with user defined schema with Integer types") {
80 | val df = readFromResources(
81 | spark,
82 | path = "ca_dataset/2019/Quarter=4/ca_12.xlsx",
83 | options = Map("header" -> true),
84 | schema = userDefinedSchema_01
85 | ).limit(5)
86 | val expected = spark.createDataFrame(expectedData_01, userDefinedSchema_01)
87 |
88 | assertDataFrameEquals(expected, df)
89 | }
90 |
91 | test("load with user defined schema with both Integer and Long types") {
92 | val df = readFromResources(
93 | spark,
94 | path = "ca_dataset/2019/Quarter=4/ca_12.xlsx",
95 | options = Map("header" -> true),
96 | schema = userDefinedSchema_02
97 | ).limit(5)
98 | val expected = spark.createDataFrame(expectedData_02, userDefinedSchema_02)
99 |
100 | assertDataFrameEquals(expected, df)
101 | }
102 | }
103 |
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/RowNumberColumnSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import dev.mauch.spark.DataFrameSuiteBase
20 | import org.apache.spark.sql.Row
21 | import org.apache.spark.sql.types._
22 | import org.scalatest.funsuite.AnyFunSuite
23 |
24 | import java.util
25 | import scala.jdk.CollectionConverters._
26 |
27 | /** Related issues: #40 Allow reading only a subset of rows https://github.dev/mauch/spark-excel/issues/40,
28 | * #59 Rows are returned in incorrect order on cluster https://github.dev/mauch/spark-excel/issues/59,
29 | * #115 Add excel row number column https://github.dev/mauch/spark-excel/issues/115
30 | */
31 | object RowNumberColumnSuite {
32 |
33 | val expectedSchema = StructType(
34 | List(
35 | StructField("RowID", IntegerType, true),
36 | StructField("1", StringType, true),
37 | StructField("2", StringType, true),
38 | StructField("3", StringType, true)
39 | )
40 | )
41 |
42 | val expectedData_NoKeep: util.List[Row] = List(
43 | Row(0, "File info", null, null),
44 | Row(1, "Info", "Info", "Info"),
45 | Row(3, "Metadata", null, null),
46 | Row(5, null, "1", "2"),
47 | Row(6, "A", "1", "2"),
48 | Row(7, "B", "5", "6"),
49 | Row(8, "C", "9", "10"),
50 | Row(11, "Metadata", null, null),
51 | Row(13, null, "1", "2"),
52 | Row(14, "A", "1", "2"),
53 | Row(15, "B", "4", "5"),
54 | Row(16, "C", "7", "8")
55 | ).asJava
56 |
57 | val expectedData_Keep: util.List[Row] = List(
58 | Row(0, "File info", null, null),
59 | Row(1, "Info", "Info", "Info"),
60 | Row(null, null, null, null),
61 | Row(3, "Metadata", null, null),
62 | Row(null, null, null, null),
63 | Row(5, null, "1", "2"),
64 | Row(6, "A", "1", "2"),
65 | Row(7, "B", "5", "6"),
66 | Row(8, "C", "9", "10"),
67 | Row(null, null, null, null),
68 | Row(null, null, null, null),
69 | Row(11, "Metadata", null, null),
70 | Row(null, null, null, null),
71 | Row(13, null, "1", "2"),
72 | Row(14, "A", "1", "2"),
73 | Row(15, "B", "4", "5"),
74 | Row(16, "C", "7", "8")
75 | ).asJava
76 |
77 | val expectedSchema_Projection = StructType(
78 | List(
79 | StructField("3", StringType, true),
80 | StructField("RowID", IntegerType, true),
81 | StructField("2", StringType, true)
82 | )
83 | )
84 |
85 | val expectedData_Projection: util.List[Row] = List(
86 | Row(null, 0, null),
87 | Row("Info", 1, "Info"),
88 | Row(null, 3, null),
89 | Row("2", 5, "1"),
90 | Row("2", 6, "1"),
91 | Row("6", 7, "5"),
92 | Row("10", 8, "9"),
93 | Row(null, 11, null),
94 | Row("2", 13, "1"),
95 | Row("2", 14, "1"),
96 | Row("5", 15, "4"),
97 | Row("8", 16, "7")
98 | ).asJava
99 |
100 | }
101 |
102 | class RowNumberColumnSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities {
103 | import RowNumberColumnSuite._
104 |
105 | test("read with additional excel row number column") {
106 | val df = readFromResources(
107 | spark,
108 | path = "issue_285_bryce21.xlsx",
109 | Map("header" -> false, "keepUndefinedRows" -> false, "columnNameOfRowNumber" -> "RowID"),
110 | schema = expectedSchema
111 | )
112 | val expected = spark.createDataFrame(expectedData_NoKeep, expectedSchema)
113 | assertDataFrameEquals(expected, df)
114 | }
115 |
116 | test("read with additional excel row number column, keep undefined rows") {
117 | val df = readFromResources(
118 | spark,
119 | path = "issue_285_bryce21.xlsx",
120 | Map("header" -> false, "keepUndefinedRows" -> true, "columnNameOfRowNumber" -> "RowID"),
121 | schema = expectedSchema
122 | )
123 | val expected = spark.createDataFrame(expectedData_Keep, expectedSchema)
124 | assertDataFrameEquals(expected, df)
125 | }
126 |
127 | test("read with additional excel row number column, projection") {
128 | val df = readFromResources(
129 | spark,
130 | path = "issue_285_bryce21.xlsx",
131 | Map("header" -> false, "keepUndefinedRows" -> false, "columnNameOfRowNumber" -> "RowID"),
132 | schema = expectedSchema
133 | ).select("3", "RowID", "2")
134 | val expected = spark.createDataFrame(expectedData_Projection, expectedSchema_Projection)
135 | assertDataFrameEquals(expected, df)
136 | }
137 | }
138 |
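For reference, the row-number options exercised above correspond to an ordinary read roughly as sketched below (hedged, not taken from this repository; it assumes an existing SparkSession named spark and uses a placeholder path). "columnNameOfRowNumber" adds a column carrying the sheet row index (0-based in the expected data above), and "keepUndefinedRows" decides whether physically empty rows appear as all-null rows or are dropped:

    import org.apache.spark.sql.types._

    val rowIdSchema = StructType(
      List(
        StructField("RowID", IntegerType, true),
        StructField("1", StringType, true),
        StructField("2", StringType, true),
        StructField("3", StringType, true)
      )
    )

    val numberedDf = spark.read
      .format("excel")
      .option("header", false)
      .option("columnNameOfRowNumber", "RowID") // extra column with the Excel row index
      .option("keepUndefinedRows", true)        // keep empty rows as all-null rows
      .schema(rowIdSchema)
      .load("/data/issue_285_bryce21.xlsx")     // placeholder path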
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/spark/excel/v2/UserReportedIssuesSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch.spark.excel.v2
18 |
19 | import dev.mauch.spark.DataFrameSuiteBase
20 | import org.apache.spark.sql.Row
21 | import org.apache.spark.sql.types._
22 | import org.scalatest.funsuite.AnyFunSuite
23 |
24 | import java.util
25 | import scala.jdk.CollectionConverters._
26 | import java.sql.Date
27 |
28 | object UserReportedIssuesSuite {
29 |
30 | /** Issue #463: Cannot load Date and Decimal fields (https://github.dev/mauch/spark-excel/issues/463)
31 | */
32 | val userDefined_Issue463 = StructType(
33 | List(
34 | StructField("itm no", StringType, true),
35 | StructField("Expense", DecimalType(23, 10), true),
36 | StructField("Date", DateType, true)
37 | )
38 | )
39 |
40 | val expectedData_Issue463: util.List[Row] =
41 | List(Row("item1", Decimal("1.1"), Date.valueOf("2021-10-01"))).asJava
42 |
43 | }
44 |
45 | class UserReportedIssuesSuite extends AnyFunSuite with DataFrameSuiteBase with ExcelTestingUtilities {
46 | import UserReportedIssuesSuite._
47 |
48 | test("#463 Date and decimal with user defined schema") {
49 | val df = readFromResources(
50 | spark,
51 | path = "issue_463_cristichircu.xlsx",
52 | options = Map("header" -> true),
53 | schema = userDefined_Issue463
54 | )
55 | val expected = spark.createDataFrame(expectedData_Issue463, userDefined_Issue463)
56 | assertDataFrameEquals(expected, df)
57 | }
58 | }
59 |
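The #463 regression test above pins down Date and Decimal handling under a user-defined schema. A rough equivalent outside the test harness might look like this (a sketch that reuses the column names from the test schema, assumes an existing SparkSession named spark, and uses a placeholder path):

    import org.apache.spark.sql.types._

    // DecimalType(23, 10) keeps the cell value as a BigDecimal-backed Decimal;
    // DateType maps a date-formatted cell to java.sql.Date.
    val schema463 = StructType(
      List(
        StructField("itm no", StringType, true),
        StructField("Expense", DecimalType(23, 10), true),
        StructField("Date", DateType, true)
      )
    )

    val expenses = spark.read
      .format("excel")
      .option("header", true)
      .schema(schema463)
      .load("/data/expenses.xlsx") // placeholder path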
--------------------------------------------------------------------------------
/src/test/scala/dev/mauch/tags/package.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2022 Martin Mauch (@nightscape)
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package dev.mauch
18 |
19 | import org.scalatest.Tag
20 |
21 | package object tags {
22 | object WIP extends Tag("dev.mauch.tags.WIP")
23 | }
24 |
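The WIP tag above lets individual tests be included or excluded by tag name through ScalaTest's runner. A minimal sketch of how a test might carry the tag (the suite name and test body are hypothetical):

    import dev.mauch.tags.WIP
    import org.scalatest.funsuite.AnyFunSuite

    class ExampleTaggedSuite extends AnyFunSuite {
      // Select with -n dev.mauch.tags.WIP, or exclude with -l dev.mauch.tags.WIP.
      test("behaviour still under development", WIP) {
        assert(1 + 1 == 2)
      }
    }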
--------------------------------------------------------------------------------