├── .editorconfig ├── .github ├── ISSUE_TEMPLATE │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── jacoco_check.yml │ └── test.yml ├── .gitignore ├── LICENSE ├── README.md ├── build.sbt ├── project ├── Dependencies.scala ├── build.properties └── plugins.sbt ├── publish.sbt ├── src ├── main │ ├── scala │ │ └── za │ │ │ └── co │ │ │ └── absa │ │ │ └── spark │ │ │ └── hats │ │ │ ├── Extensions.scala │ │ │ ├── transformations │ │ │ ├── ArrayContext.scala │ │ │ └── NestedArrayTransformations.scala │ │ │ └── utils │ │ │ ├── JsonUtils.scala │ │ │ └── SchemaUtils.scala │ ├── scala_2.11 │ │ └── za │ │ │ └── co │ │ │ └── absa │ │ │ └── spark │ │ │ └── hats │ │ │ └── HofsWrapper.scala │ ├── scala_2.12 │ │ └── za │ │ │ └── co │ │ │ └── absa │ │ │ └── spark │ │ │ └── hats │ │ │ └── HofsWrapper.scala │ └── scala_2.13 │ │ └── za │ │ └── co │ │ └── absa │ │ └── spark │ │ └── hats │ │ └── HofsWrapper.scala └── test │ ├── resources │ ├── log4j.properties │ ├── log4j2.properties │ └── test_data │ │ └── nested │ │ ├── nested10Results.json │ │ ├── nested10Schema.txt │ │ ├── nested1Results.json │ │ ├── nested1Schema.txt │ │ ├── nested2Results.json │ │ ├── nested2Schema.txt │ │ ├── nested3Results.json │ │ ├── nested3Schema.txt │ │ ├── nested4Results.json │ │ ├── nested4Schema.txt │ │ ├── nested5Results.json │ │ ├── nested5Schema.txt │ │ ├── nested6Results.json │ │ ├── nested6Schema.txt │ │ ├── nested7Results.json │ │ ├── nested7Schema.txt │ │ ├── nested8Results.json │ │ ├── nested8Schema.txt │ │ ├── nested9Results.json │ │ ├── nested9Schema.txt │ │ └── nestedDf1.json │ └── scala │ └── za │ └── co │ └── absa │ └── spark │ └── hats │ ├── SparkTestBase.scala │ └── transformations │ ├── DeepArrayErrorTransformationSuite.scala │ ├── DeepArrayTransformationSuite.scala │ ├── ExtendedTransformationsSuite.scala │ └── samples │ ├── DeepArraySamples.scala │ ├── ErrorMessage.scala │ ├── NestedMapTestCaseFactory.scala │ ├── NestedTestCaseFactory.scala │ └── SampleErrorUDFs.scala └── version.sbt /.editorconfig: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2019 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # 15 | 16 | # top-most EditorConfig file 17 | root = true 18 | 19 | [*] 20 | charset = utf-8 21 | end_of_line = lf 22 | trim_trailing_whitespace = true 23 | 24 | [*.xml] 25 | indent_size = 4 26 | indent_style = space 27 | insert_final_newline = true 28 | 29 | [*.properties] 30 | insert_final_newline = true 31 | 32 | [*.{java,scala,js,json,css}] 33 | indent_size = 2 34 | indent_style = space 35 | insert_final_newline = true 36 | max_line_length = 120 37 | 38 | [*.md] 39 | trim_trailing_whitespace = false 40 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: '' 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | A clear and concise description of what the bug is. 12 | 13 | **To Reproduce** 14 | Steps to reproduce the behavior: 15 | 1. Go to '...' 16 | 2. Click on '....' 17 | 3. Scroll down to '....' 18 | 4. See error 19 | 20 | **Expected behavior** 21 | A clear and concise description of what you expected to happen. 22 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: '' 5 | labels: enhancement 6 | assignees: '' 7 | 8 | --- 9 | 10 | ## Background 11 | A clear and concise description of where the limitation lies. 12 | 13 | ## Feature 14 | A description of the requested feature. 15 | 16 | ## Example [Optional] 17 | A simple example if applicable. 18 | 19 | ## Proposed Solution [Optional] 20 | Solution Ideas 21 | 1. 22 | 2. 23 | 3. 24 | -------------------------------------------------------------------------------- /.github/workflows/jacoco_check.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2018 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | # 15 | 16 | name: JaCoCo report 17 | 18 | on: 19 | pull_request: 20 | branches: [ master ] 21 | types: [ opened, edited, synchronize, reopened ] 22 | 23 | jobs: 24 | test: 25 | runs-on: ubuntu-latest 26 | strategy: 27 | fail-fast: false 28 | matrix: 29 | include: 30 | - scala: 2.11.12 31 | scalaShort: "2.11" 32 | spark: 2.4.8 33 | overall: 0.0 34 | changed: 80.0 35 | - scala: 2.12.18 36 | scalaShort: "2.12" 37 | spark: 3.2.4 38 | overall: 0.0 39 | changed: 80.0 40 | - scala: 2.13.11 41 | scalaShort: "2.13" 42 | spark: 3.4.1 43 | overall: 0.0 44 | changed: 80.0 45 | name: Check code coverage by JaCoCo - Spark ${{matrix.spark}} on Scala ${{matrix.scala}} 46 | steps: 47 | - name: Checkout code 48 | uses: actions/checkout@v2 49 | - uses: coursier/cache-action@v5 50 | - name: Setup Scala 51 | uses: olafurpg/setup-scala@v10 52 | with: 53 | java-version: "adopt@1.8" 54 | - name: Build and run tests 55 | run: sbt ++${{matrix.scala}} jacoco -DSPARK_VERSION=${{matrix.spark}} 56 | - name: Add coverage to PR for Scala ${{matrix.scala}} & Spark ${{matrix.spark}} 57 | id: jacoco 58 | uses: madrapps/jacoco-report@v1.3 59 | with: 60 | paths: ${{ github.workspace }}/target/scala-${{ matrix.scalaShort }}/jacoco/report/jacoco.xml 61 | token: ${{ secrets.GITHUB_TOKEN }} 62 | min-coverage-overall: ${{ matrix.overall }} 63 | min-coverage-changed-files: ${{ matrix.changed }} 64 | title: JaCoCo code coverage report - Scala ${{ matrix.scala }} & Spark ${{ matrix.spark }} 65 | update-comment: true 66 | - name: Get the Coverage info 67 | run: | 68 | echo "Total coverage ${{ steps.jacoco.outputs.coverage-overall }}" 69 | echo "Changed Files coverage ${{ steps.jacoco.outputs.coverage-changed-files }}" 70 | - name: Fail PR if changed files coverage is less than ${{ matrix.changed }}% 71 | if: ${{ steps.jacoco.outputs.coverage-changed-files < matrix.changed }} 72 | uses: actions/github-script@v6 73 | with: 74 | script: | 75 | core.setFailed('Changed files coverage is less than ${{ matrix.changed }}%!') 76 | 77 | -------------------------------------------------------------------------------- /.github/workflows/test.yml: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2019 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License.
14 | # 15 | 16 | name: Build 17 | 18 | on: 19 | push: 20 | branches: [ master ] 21 | pull_request: 22 | branches: [ master ] 23 | 24 | jobs: 25 | test: 26 | runs-on: ubuntu-latest 27 | strategy: 28 | fail-fast: false 29 | matrix: 30 | scala: [2.11.12, 2.12.18, 2.13.11] 31 | spark: [2.4.8, 3.2.4, 3.4.1] 32 | exclude: 33 | - scala: 2.11.12 34 | spark: 3.2.4 35 | - scala: 2.11.12 36 | spark: 3.4.1 37 | - scala: 2.12.18 38 | spark: 2.4.8 39 | - scala: 2.13.11 40 | spark: 2.4.8 41 | name: Test Spark ${{matrix.spark}} on Scala ${{matrix.scala}} 42 | steps: 43 | - name: Checkout code 44 | uses: actions/checkout@v2 45 | - uses: coursier/cache-action@v5 46 | - name: Setup Scala 47 | uses: olafurpg/setup-scala@v10 48 | with: 49 | java-version: "adopt@1.8" 50 | - name: Build and run tests 51 | run: sbt ++${{matrix.scala}} test -DSPARK_VERSION=${{matrix.spark}} 52 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2019 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | # 15 | 16 | # use glob syntax. 17 | syntax: glob 18 | *.ser 19 | *.class 20 | *~ 21 | *.bak 22 | #*.off 23 | *.old 24 | 25 | # eclipse conf file 26 | .settings 27 | .classpath 28 | .project 29 | .manager 30 | .scala_dependencies 31 | 32 | # idea 33 | .idea 34 | *.iml 35 | 36 | # building 37 | target 38 | build 39 | null 40 | tmp* 41 | temp* 42 | dist 43 | test-output 44 | build.log 45 | 46 | # other scm 47 | .svn 48 | .CVS 49 | .hg* 50 | 51 | .cache* 52 | dependency-reduced-pom.xml 53 | 54 | .bsp 55 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | 2 | Apache License 3 | Version 2.0, January 2004 4 | http://www.apache.org/licenses/ 5 | 6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 7 | 8 | 1. Definitions. 9 | 10 | "License" shall mean the terms and conditions for use, reproduction, 11 | and distribution as defined by Sections 1 through 9 of this document. 12 | 13 | "Licensor" shall mean the copyright owner or entity authorized by 14 | the copyright owner that is granting the License. 15 | 16 | "Legal Entity" shall mean the union of the acting entity and all 17 | other entities that control, are controlled by, or are under common 18 | control with that entity. For the purposes of this definition, 19 | "control" means (i) the power, direct or indirect, to cause the 20 | direction or management of such entity, whether by contract or 21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 22 | outstanding shares, or (iii) beneficial ownership of such entity. 23 | 24 | "You" (or "Your") shall mean an individual or Legal Entity 25 | exercising permissions granted by this License. 
26 | 27 | "Source" form shall mean the preferred form for making modifications, 28 | including but not limited to software source code, documentation 29 | source, and configuration files. 30 | 31 | "Object" form shall mean any form resulting from mechanical 32 | transformation or translation of a Source form, including but 33 | not limited to compiled object code, generated documentation, 34 | and conversions to other media types. 35 | 36 | "Work" shall mean the work of authorship, whether in Source or 37 | Object form, made available under the License, as indicated by a 38 | copyright notice that is included in or attached to the work 39 | (an example is provided in the Appendix below). 40 | 41 | "Derivative Works" shall mean any work, whether in Source or Object 42 | form, that is based on (or derived from) the Work and for which the 43 | editorial revisions, annotations, elaborations, or other modifications 44 | represent, as a whole, an original work of authorship. For the purposes 45 | of this License, Derivative Works shall not include works that remain 46 | separable from, or merely link (or bind by name) to the interfaces of, 47 | the Work and Derivative Works thereof. 48 | 49 | "Contribution" shall mean any work of authorship, including 50 | the original version of the Work and any modifications or additions 51 | to that Work or Derivative Works thereof, that is intentionally 52 | submitted to Licensor for inclusion in the Work by the copyright owner 53 | or by an individual or Legal Entity authorized to submit on behalf of 54 | the copyright owner. For the purposes of this definition, "submitted" 55 | means any form of electronic, verbal, or written communication sent 56 | to the Licensor or its representatives, including but not limited to 57 | communication on electronic mailing lists, source code control systems, 58 | and issue tracking systems that are managed by, or on behalf of, the 59 | Licensor for the purpose of discussing and improving the Work, but 60 | excluding communication that is conspicuously marked or otherwise 61 | designated in writing by the copyright owner as "Not a Contribution." 62 | 63 | "Contributor" shall mean Licensor and any individual or Legal Entity 64 | on behalf of whom a Contribution has been received by Licensor and 65 | subsequently incorporated within the Work. 66 | 67 | 2. Grant of Copyright License. Subject to the terms and conditions of 68 | this License, each Contributor hereby grants to You a perpetual, 69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 70 | copyright license to reproduce, prepare Derivative Works of, 71 | publicly display, publicly perform, sublicense, and distribute the 72 | Work and such Derivative Works in Source or Object form. 73 | 74 | 3. Grant of Patent License. Subject to the terms and conditions of 75 | this License, each Contributor hereby grants to You a perpetual, 76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 77 | (except as stated in this section) patent license to make, have made, 78 | use, offer to sell, sell, import, and otherwise transfer the Work, 79 | where such license applies only to those patent claims licensable 80 | by such Contributor that are necessarily infringed by their 81 | Contribution(s) alone or by combination of their Contribution(s) 82 | with the Work to which such Contribution(s) was submitted. 
If You 83 | institute patent litigation against any entity (including a 84 | cross-claim or counterclaim in a lawsuit) alleging that the Work 85 | or a Contribution incorporated within the Work constitutes direct 86 | or contributory patent infringement, then any patent licenses 87 | granted to You under this License for that Work shall terminate 88 | as of the date such litigation is filed. 89 | 90 | 4. Redistribution. You may reproduce and distribute copies of the 91 | Work or Derivative Works thereof in any medium, with or without 92 | modifications, and in Source or Object form, provided that You 93 | meet the following conditions: 94 | 95 | (a) You must give any other recipients of the Work or 96 | Derivative Works a copy of this License; and 97 | 98 | (b) You must cause any modified files to carry prominent notices 99 | stating that You changed the files; and 100 | 101 | (c) You must retain, in the Source form of any Derivative Works 102 | that You distribute, all copyright, patent, trademark, and 103 | attribution notices from the Source form of the Work, 104 | excluding those notices that do not pertain to any part of 105 | the Derivative Works; and 106 | 107 | (d) If the Work includes a "NOTICE" text file as part of its 108 | distribution, then any Derivative Works that You distribute must 109 | include a readable copy of the attribution notices contained 110 | within such NOTICE file, excluding those notices that do not 111 | pertain to any part of the Derivative Works, in at least one 112 | of the following places: within a NOTICE text file distributed 113 | as part of the Derivative Works; within the Source form or 114 | documentation, if provided along with the Derivative Works; or, 115 | within a display generated by the Derivative Works, if and 116 | wherever such third-party notices normally appear. The contents 117 | of the NOTICE file are for informational purposes only and 118 | do not modify the License. You may add Your own attribution 119 | notices within Derivative Works that You distribute, alongside 120 | or as an addendum to the NOTICE text from the Work, provided 121 | that such additional attribution notices cannot be construed 122 | as modifying the License. 123 | 124 | You may add Your own copyright statement to Your modifications and 125 | may provide additional or different license terms and conditions 126 | for use, reproduction, or distribution of Your modifications, or 127 | for any such Derivative Works as a whole, provided Your use, 128 | reproduction, and distribution of the Work otherwise complies with 129 | the conditions stated in this License. 130 | 131 | 5. Submission of Contributions. Unless You explicitly state otherwise, 132 | any Contribution intentionally submitted for inclusion in the Work 133 | by You to the Licensor shall be under the terms and conditions of 134 | this License, without any additional terms or conditions. 135 | Notwithstanding the above, nothing herein shall supersede or modify 136 | the terms of any separate license agreement you may have executed 137 | with Licensor regarding such Contributions. 138 | 139 | 6. Trademarks. This License does not grant permission to use the trade 140 | names, trademarks, service marks, or product names of the Licensor, 141 | except as required for reasonable and customary use in describing the 142 | origin of the Work and reproducing the content of the NOTICE file. 143 | 144 | 7. Disclaimer of Warranty. 
Unless required by applicable law or 145 | agreed to in writing, Licensor provides the Work (and each 146 | Contributor provides its Contributions) on an "AS IS" BASIS, 147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 148 | implied, including, without limitation, any warranties or conditions 149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 150 | PARTICULAR PURPOSE. You are solely responsible for determining the 151 | appropriateness of using or redistributing the Work and assume any 152 | risks associated with Your exercise of permissions under this License. 153 | 154 | 8. Limitation of Liability. In no event and under no legal theory, 155 | whether in tort (including negligence), contract, or otherwise, 156 | unless required by applicable law (such as deliberate and grossly 157 | negligent acts) or agreed to in writing, shall any Contributor be 158 | liable to You for damages, including any direct, indirect, special, 159 | incidental, or consequential damages of any character arising as a 160 | result of this License or out of the use or inability to use the 161 | Work (including but not limited to damages for loss of goodwill, 162 | work stoppage, computer failure or malfunction, or any and all 163 | other commercial damages or losses), even if such Contributor 164 | has been advised of the possibility of such damages. 165 | 166 | 9. Accepting Warranty or Additional Liability. While redistributing 167 | the Work or Derivative Works thereof, You may choose to offer, 168 | and charge a fee for, acceptance of support, warranty, indemnity, 169 | or other liability obligations and/or rights consistent with this 170 | License. However, in accepting such obligations, You may act only 171 | on Your own behalf and on Your sole responsibility, not on behalf 172 | of any other Contributor, and only if You agree to indemnify, 173 | defend, and hold each Contributor harmless for any liability 174 | incurred by, or claims asserted against, such Contributor by reason 175 | of your accepting any such warranty or additional liability. 176 | 177 | END OF TERMS AND CONDITIONS 178 | 179 | APPENDIX: How to apply the Apache License to your work. 180 | 181 | To apply the Apache License to your work, attach the following 182 | boilerplate notice, with the fields enclosed by brackets "[]" 183 | replaced with your own identifying information. (Don't include 184 | the brackets!) The text should be enclosed in the appropriate 185 | comment syntax for the file format. We also recommend that a 186 | file or class name and description of purpose be included on the 187 | same "printed page" as the copyright notice for easier 188 | identification within third-party archives. 189 | 190 | Copyright [yyyy] [name of copyright owner] 191 | 192 | Licensed under the Apache License, Version 2.0 (the "License"); 193 | you may not use this file except in compliance with the License. 194 | You may obtain a copy of the License at 195 | 196 | http://www.apache.org/licenses/LICENSE-2.0 197 | 198 | Unless required by applicable law or agreed to in writing, software 199 | distributed under the License is distributed on an "AS IS" BASIS, 200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 201 | See the License for the specific language governing permissions and 202 | limitations under the License. 
-------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # spark-hats 2 | [![Build](https://github.com/AbsaOSS/spark-hats/workflows/Build/badge.svg)](https://github.com/AbsaOSS/spark-hats/actions) 3 | [![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2FAbsaOSS%2Fspark-hats.svg?type=shield)](https://app.fossa.com/projects/git%2Bgithub.com%2FAbsaOSS%2Fspark-hats?ref=badge_shield) 4 | 5 | Spark "**H**elpers for **A**rray **T**ransformation**s**" 6 | 7 | This library extends the Spark DataFrame API with helpers for transforming fields inside nested structures and arrays of 8 | arbitrary levels of nesting. 9 | 10 | ## Usage 11 | 12 | Reference the library: 13 | 14 | 15 | <table> 16 | <tr> 17 | <th>
Scala 2.11</th>
<th>Scala 2.12</th>
<th>Scala 2.13</th>
</tr>
<tr>
<td align="center">
Maven Central<br>
</td>
<td align="center">
Maven Central<br>
</td>
<td align="center">
Maven Central<br>
</td>
</tr>
<tr>
<td>
groupId: za.co.absa<br>
artifactId: spark-hats_2.11<br>
version: 0.3.0<br>
</td>
<td>
groupId: za.co.absa<br>
artifactId: spark-hats_2.12<br>
version: 0.3.0<br>
</td>
<td>
groupId: za.co.absa<br>
artifactId: spark-hats_2.13<br>
version: 0.3.0<br>
</td>
</tr>
</table>
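For an sbt build, the coordinates above translate to a single dependency line. This is a sketch assuming the standard `%%` cross-version convention (sbt then picks the `_2.11`/`_2.12`/`_2.13` suffix for you):

```scala
libraryDependencies += "za.co.absa" %% "spark-hats" % "0.3.0"
```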
40 |
41 | Please use the table below to determine which version of spark-hats to use for your Spark version.
42 |
43 | | spark-hats version | Scala version | Spark version |
44 | |:------------------:|:-------------:|:-------------:|
45 | | 0.1.x | 2.11, 2.12 | 2.4.3+ |
46 | | 0.2.x | 2.11, 2.12 | 2.4.3+ |
47 | | 0.2.x | 2.12 | 3.0.0+ |
48 | | 0.3.x | 2.11 | 2.4.3+ |
49 | | 0.3.x | 2.12, 2.13 | 3.2.1+ |
50 |
51 | To use the extensions you need to add this import to your Spark application or shell:
52 | ```scala
53 | import za.co.absa.spark.hats.Extensions._
54 | ```
55 |
56 | ### How to generate a code coverage report
57 | ```
58 | sbt ++{matrix.scala} jacoco -DSPARK_VERSION={matrix.spark}
59 | ```
60 | The code coverage report will be generated at:
61 | ```
62 | {project-root}/spark-hats/target/scala-{scala_version}/jacoco/report/html
63 | ```
64 |
65 |
66 | ## Motivation
67 |
68 | Here is a small example we will use to show you how `spark-hats` works. The important thing is that the dataframe
69 | contains an array of struct fields.
70 |
71 | ```scala
72 | scala> df.printSchema()
73 | root
74 | |-- id: long (nullable = true)
75 | |-- my_array: array (nullable = true)
76 | | |-- element: struct (containsNull = true)
77 | | | |-- a: long (nullable = true)
78 | | | |-- b: string (nullable = true)
79 |
80 | scala> df.show(false)
81 | +---+------------------------------+
82 | |id |my_array |
83 | +---+------------------------------+
84 | |1 |[[1, foo]] |
85 | |2 |[[1, bar], [2, baz], [3, foz]]|
86 | +---+------------------------------+
87 | ```
88 |
89 | Now, say, we want to add a field `c` as part of the struct alongside `a` and `b` from the example above. The
90 | expression for `c` is `c = a + 1`.
91 |
92 | Here is the code you can use in Spark:
93 | ```scala
94 | val dfOut = df.select(col("id"), transform(col("my_array"), c => {
95 | struct(c.getField("a").as("a"),
96 | c.getField("b").as("b"),
97 | (c.getField("a") + 1).as("c"))
98 | }).as("my_array"))
99 |
100 | ```
101 | (to use `transform()` in the Scala API you need to add [spark-hofs](https://github.com/AbsaOSS/spark-hofs) as a dependency).
102 |
103 | Here is how it looks when using the `spark-hats` library:
104 | ```scala
105 | val dfOut = df.nestedMapColumn("my_array.a", "c", a => a + 1)
106 | ```
107 |
108 | Both produce the following results:
109 | ```scala
110 | scala> dfOut.printSchema
111 | root
112 | |-- id: long (nullable = true)
113 | |-- my_array: array (nullable = true)
114 | | |-- element: struct (containsNull = false)
115 | | | |-- a: long (nullable = true)
116 | | | |-- b: string (nullable = true)
117 | | | |-- c: long (nullable = true)
118 |
119 | scala> dfOut.show(false)
120 | +---+---------------------------------------+
121 | |id |my_array |
122 | +---+---------------------------------------+
123 | |1 |[[1, foo, 2]] |
124 | |2 |[[1, bar, 2], [2, baz, 3], [3, foz, 4]]|
125 | +---+---------------------------------------+
126 | ```
127 |
128 | Imagine how the code would look for deeper levels of array nesting.
129 |
130 | ## Methods
131 |
132 | ### Add a column
133 | The `nestedWithColumn` method allows adding new fields inside nested structures and arrays.
134 |
135 | The column addition API is provided in two flavors: the basic and the extended API. The basic API is simpler to
use, but the expressions it expects can only reference columns at the root of the schema.
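For instance, reusing `df` from the Motivation section above, a basic-API expression can use a root-level column (a sketch; `id` is the root-level column of that dataframe):

```scala
df.nestedWithColumn("my_array.c", col("id") + 1)
```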
Here is an example of the basic add column API:
137 |
138 | ```scala
139 | scala> df.nestedWithColumn("my_array.c", lit("hello")).printSchema
140 | root
141 | |-- id: long (nullable = true)
142 | |-- my_array: array (nullable = true)
143 | | |-- element: struct (containsNull = false)
144 | | | |-- a: long (nullable = true)
145 | | | |-- b: string (nullable = true)
146 | | | |-- c: string (nullable = false)
147 |
148 | scala> df.nestedWithColumn("my_array.c", lit("hello")).show(false)
149 | +---+---------------------------------------------------+
150 | |id |my_array |
151 | +---+---------------------------------------------------+
152 | |1 |[[1, foo, hello]] |
153 | |2 |[[1, bar, hello], [2, baz, hello], [3, foz, hello]]|
154 | +---+---------------------------------------------------+
155 | ```
156 |
157 | ### Add a column (extended)
158 | The extended API method `nestedWithColumnExtended` works similarly to the basic one but allows the caller to reference
159 | other array elements, possibly on different levels of nesting. The way it allows this is a little tricky.
160 | The second parameter is changed from being a column to a *function that returns a column*. Moreover, this function
161 | takes an argument that is itself a function: the `getField()` function. The `getField()` function can be used in the
162 | transformation to reference other columns in the dataframe by their fully qualified names.
163 |
164 | In the following example, a transformation adds a new field `my_array.c` to the dataframe by concatenating a root-level
165 | column `id` with a nested field `my_array.b`:
166 |
167 | ```scala
168 | scala> val dfOut = df.nestedWithColumnExtended("my_array.c", getField =>
169 | concat(getField("id").cast("string"), getField("my_array.b"))
170 | )
171 |
172 | scala> dfOut.printSchema
173 | root
174 | |-- id: long (nullable = true)
175 | |-- my_array: array (nullable = true)
176 | | |-- element: struct (containsNull = false)
177 | | | |-- a: long (nullable = true)
178 | | | |-- b: string (nullable = true)
179 | | | |-- c: string (nullable = true)
180 |
181 | scala> dfOut.show(false)
182 | +---+------------------------------------------------+
183 | |id |my_array |
184 | +---+------------------------------------------------+
185 | |1 |[[1, foo, 1foo]] |
186 | |2 |[[1, bar, 2bar], [2, baz, 2baz], [3, foz, 2foz]]|
187 | +---+------------------------------------------------+
188 | ```
189 |
190 | * **Note.** You can still use `col` to reference root-level columns. But if a column is inside an array (like
191 | `my_array.b`), invoking `col("my_array.b")` will reference the whole array, not an individual element. The `getField()`
192 | function that is passed to the transformation solves this by adding a generic way of addressing array elements on arbitrary
193 | levels of nesting.
194 |
195 | * **Advanced Note.** If there are several arrays in the schema, `getField()` allows referencing elements of an array
196 | if that array is one of the parents of the output column.
197 |
198 |
199 | ### Drop a column
200 | The `nestedDropColumn` method allows dropping fields inside nested structures and arrays.
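Only one field is dropped per call; to remove several nested fields the calls can be chained, since each call returns a new dataframe. A minimal sketch (using the same hypothetical `df` as above):

```scala
df.nestedDropColumn("my_array.b")
  .nestedDropColumn("my_array.a")
```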
201 |
202 |
203 | ```scala
204 | scala> df.nestedDropColumn("my_array.b").printSchema
205 | root
206 | |-- id: long (nullable = true)
207 | |-- my_array: array (nullable = true)
208 | | |-- element: struct (containsNull = false)
209 | | | |-- a: long (nullable = true)
210 |
211 | scala> df.nestedDropColumn("my_array.b").show(false)
212 | +---+---------------+
213 | |id |my_array |
214 | +---+---------------+
215 | |1 |[[1]] |
216 | |2 |[[1], [2], [3]]|
217 | +---+---------------+
218 | ```
219 |
220 | ### Map a column
221 |
222 | The `nestedMapColumn` method applies a transformation on a nested field. If the input column is a primitive field the
223 | method will add `outputColumnName` at the same level of nesting. If a struct column is expected you can use the
224 | `.getField(...)` method to operate on its children.
225 |
226 | The output column name can omit the full path as the field will be created at the same level of nesting as the input column.
227 |
228 | ```scala
229 | scala> df.nestedMapColumn(inputColumnName = "my_array.a", outputColumnName = "c", expression = a => a + 1).printSchema
230 | root
231 | |-- id: long (nullable = true)
232 | |-- my_array: array (nullable = true)
233 | | |-- element: struct (containsNull = false)
234 | | | |-- a: long (nullable = true)
235 | | | |-- b: string (nullable = true)
236 | | | |-- c: long (nullable = true)
237 |
238 | scala> df.nestedMapColumn(inputColumnName = "my_array.a", outputColumnName = "c", expression = a => a + 1).show(false)
239 | +---+---------------------------------------+
240 | |id |my_array |
241 | +---+---------------------------------------+
242 | |1 |[[1, foo, 2]] |
243 | |2 |[[1, bar, 2], [2, baz, 3], [3, foz, 4]]|
244 | +---+---------------------------------------+
245 | ```
246 |
247 | ## Other transformations
248 |
249 | ### Unstruct
250 |
251 | Syntax: `df.nestedUnstruct("NestedStructColumnName")`.
252 |
253 | Flattens one level of nesting when a struct is nested in another struct. For example:
254 |
255 | ```scala
256 | scala> df.printSchema
257 | root
258 | |-- id: long (nullable = true)
259 | |-- my_array: array (nullable = true)
260 | | |-- element: struct (containsNull = true)
261 | | | |-- a: long (nullable = true)
262 | | | |-- b: string (nullable = true)
263 | | | |-- c: struct (containsNull = true)
264 | | | | |-- nestedField1: string (nullable = true)
265 | | | | |-- nestedField2: long (nullable = true)
266 |
267 | scala> df.nestedUnstruct("my_array.c").printSchema
268 | root
269 | |-- id: long (nullable = true)
270 | |-- my_array: array (nullable = true)
271 | | |-- element: struct (containsNull = true)
272 | | | |-- a: long (nullable = true)
273 | | | |-- b: string (nullable = true)
274 | | | |-- nestedField1: string (nullable = true)
275 | | | |-- nestedField2: long (nullable = true)
276 | ```
277 |
278 | Note that the output schema doesn't have the `c` struct. All fields of `c` are now part of the parent struct.
279 |
280 | ## Changelog
281 | - #### 0.3.0 released 3 August 2023.
282 | - [#38](https://github.com/AbsaOSS/spark-hats/issues/38) Added Scala 2.13 support.
283 | - [#33](https://github.com/AbsaOSS/spark-hats/issues/33) Updated Spark tests to 3.2.1.
284 | - [#35](https://github.com/AbsaOSS/spark-hats/issues/35) Added code coverage support.
285 |
286 | - #### 0.2.2 released 8 March 2021.
287 | - [#23](https://github.com/AbsaOSS/spark-hats/issues/23) Added `nestedUnstruct()` method that flattens one level of nesting for a given struct.
288 |
289 | - #### 0.2.1 released 21 January 2020.
290 | - [#10](https://github.com/AbsaOSS/spark-hats/issues/10) Fixed error column aggregation when the input array is `null`.
291 |
292 | - #### 0.2.0 released 16 January 2020.
293 | - [#5](https://github.com/AbsaOSS/spark-hats/issues/5) Added the extended nested transformation API that allows referencing arbitrary columns.
294 |
295 |
296 | ## License
297 | [![FOSSA Status](https://app.fossa.com/api/projects/git%2Bgithub.com%2FAbsaOSS%2Fspark-hats.svg?type=large)](https://app.fossa.com/projects/git%2Bgithub.com%2FAbsaOSS%2Fspark-hats?ref=badge_large)
-------------------------------------------------------------------------------- /build.sbt: --------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import Dependencies._
18 |
19 | val scala211 = "2.11.12"
20 | val scala212 = "2.12.18"
21 | val scala213 = "2.13.11"
22 |
23 | ThisBuild / organization := "za.co.absa"
24 |
25 | ThisBuild / scalaVersion := scala212
26 | ThisBuild / crossScalaVersions := Seq(scala211, scala212, scala213)
27 |
28 | ThisBuild / scalacOptions := Seq("-unchecked", "-deprecation")
29 |
30 | // Scala shouldn't be packaged so it is explicitly added as a provided dependency below
31 | ThisBuild / autoScalaLibrary := false
32 |
33 | lazy val printSparkVersion = taskKey[Unit]("Print the Spark version spark-hats is building against.")
34 |
35 | lazy val hats = (project in file("."))
36 | .settings(
37 | name := "spark-hats",
38 | printSparkVersion := {
39 | val log = streams.value.log
40 | val effectiveSparkVersion = sparkVersion(scalaVersion.value)
41 | log.info(s"Building with Spark $effectiveSparkVersion")
42 | effectiveSparkVersion
43 | },
44 | Compile / compile := ((Compile / compile) dependsOn printSparkVersion).value,
45 | Compile / unmanagedSourceDirectories += {
46 | val sourceDir = (Compile / sourceDirectory).value
47 | CrossVersion.partialVersion(scalaVersion.value) match {
48 | case Some((2, n)) if n == 11 => sourceDir / "scala_2.11"
49 | case Some((2, n)) if n == 12 => sourceDir / "scala_2.12"
50 | case Some((2, n)) if n == 13 => sourceDir / "scala_2.13"
51 | case _ => throw new RuntimeException("Unsupported Scala version")
52 | }
53 | },
54 | libraryDependencies ++= getSparkHatsDependencies(scalaVersion.value) ++ getHofsDependency(scalaVersion.value) :+ getScalaDependency(scalaVersion.value),
55 | releasePublishArtifactsAction := PgpKeys.publishSigned.value,
56 | Test / fork := true
57 | ).enablePlugins(AutomateHeaderPlugin)
58 |
59 | // release settings
60 | releaseCrossBuild := true
61 | addCommandAlias("releaseNow", ";set releaseVersionBump := sbtrelease.Version.Bump.Bugfix; release with-defaults")
62 |
63 | // JaCoCo code coverage
64 | Test / jacocoReportSettings := JacocoReportSettings(
65 | title = s"spark-hats Jacoco Report - ${scalaVersion.value}",
66 | formats = Seq(JacocoReportFormats.HTML, JacocoReportFormats.XML)
67 | )
68 |
69 |
// exclude example 70 | Test / jacocoExcludes := Seq( 71 | // "za.co.absa.spark.hats.transformations.NestedArrayTransformation*", // class and related objects 72 | // "za.co.absa.spark.hats.transformations.ArrayContext" // class only 73 | ) 74 | -------------------------------------------------------------------------------- /project/Dependencies.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | import sbt._ 18 | 19 | object Dependencies { 20 | 21 | val defaultSparkVersionForScala211 = "2.4.8" 22 | val defaultSparkVersionForScala212 = "3.3.2" 23 | val defaultSparkVersionForScala213 = "3.4.1" 24 | 25 | private val sparkHofsVersion = "0.4.0" 26 | private val scalatestVersion = "3.2.14" 27 | 28 | def getScalaDependency(scalaVersion: String): ModuleID = "org.scala-lang" % "scala-library" % scalaVersion % Provided 29 | 30 | def getSparkHatsDependencies(scalaVersion: String): Seq[ModuleID] = Seq( 31 | // provided 32 | "org.apache.spark" %% "spark-core" % sparkVersion(scalaVersion) % Provided, 33 | "org.apache.spark" %% "spark-sql" % sparkVersion(scalaVersion) % Provided, 34 | "org.apache.spark" %% "spark-catalyst" % sparkVersion(scalaVersion) % Provided, 35 | 36 | // test 37 | "org.scalatest" %% "scalatest" % scalatestVersion % Test 38 | ) 39 | 40 | def getHofsDependency(scalaVersion: String): Seq[ModuleID] = if (scalaVersion.startsWith("2.11.")) { 41 | Seq("za.co.absa" %% "spark-hofs" % sparkHofsVersion) 42 | } else { 43 | Seq.empty 44 | } 45 | 46 | def sparkVersion(scalaVersion: String): String = sys.props.getOrElse("SPARK_VERSION", sparkFallbackVersion(scalaVersion)) 47 | 48 | def sparkFallbackVersion(scalaVersion: String): String = { 49 | if (scalaVersion.startsWith("2.11.")) { 50 | defaultSparkVersionForScala211 51 | } else if (scalaVersion.startsWith("2.12.")) { 52 | defaultSparkVersionForScala212 53 | } else if (scalaVersion.startsWith("2.13.")) { 54 | defaultSparkVersionForScala213 55 | } else { 56 | throw new IllegalArgumentException(s"Scala $scalaVersion not supported.") 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /project/build.properties: -------------------------------------------------------------------------------- 1 | # 2 | # Copyright 2019 ABSA Group Limited 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 |
16 | sbt.version=1.9.2
17 |
-------------------------------------------------------------------------------- /project/plugins.sbt: --------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.2.1")
18 | addSbtPlugin("com.github.sbt" % "sbt-release" % "1.1.0")
19 | addSbtPlugin("de.heikoseeberger" % "sbt-header" % "5.7.0")
20 |
21 | // sbt-jacoco workaround - the related dependencies need to be downloaded from explicit URLs
22 | lazy val ow2Version = "9.5"
23 | lazy val jacocoVersion = "0.8.10-absa.1"
24 |
25 | def jacocoUrl(artifactName: String): String = s"https://github.com/AbsaOSS/jacoco/releases/download/$jacocoVersion/org.jacoco.$artifactName-$jacocoVersion.jar"
26 | def ow2Url(artifactName: String): String = s"https://repo1.maven.org/maven2/org/ow2/asm/$artifactName/$ow2Version/$artifactName-$ow2Version.jar"
27 |
28 | addSbtPlugin("com.jsuereth" %% "scala-arm" % "2.0" from "https://repo1.maven.org/maven2/com/jsuereth/scala-arm_2.11/2.0/scala-arm_2.11-2.0.jar")
29 | addSbtPlugin("com.jsuereth" %% "scala-arm" % "2.0" from "https://repo1.maven.org/maven2/com/jsuereth/scala-arm_2.12/2.0/scala-arm_2.12-2.0.jar")
30 |
31 | addSbtPlugin("za.co.absa.jacoco" % "report" % jacocoVersion from jacocoUrl("report"))
32 | addSbtPlugin("za.co.absa.jacoco" % "core" % jacocoVersion from jacocoUrl("core"))
33 | addSbtPlugin("za.co.absa.jacoco" % "agent" % jacocoVersion from jacocoUrl("agent"))
34 | addSbtPlugin("org.ow2.asm" % "asm" % ow2Version from ow2Url("asm"))
35 | addSbtPlugin("org.ow2.asm" % "asm-commons" % ow2Version from ow2Url("asm-commons"))
36 | addSbtPlugin("org.ow2.asm" % "asm-tree" % ow2Version from ow2Url("asm-tree"))
37 |
38 | addSbtPlugin("za.co.absa.sbt" % "sbt-jacoco" % "3.4.1-absa.3" from "https://github.com/AbsaOSS/sbt-jacoco/releases/download/3.4.1-absa.3/sbt-jacoco-3.4.1-absa.3.jar")
39 |
-------------------------------------------------------------------------------- /publish.sbt: --------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | ThisBuild / organizationName := "ABSA Group Limited"
18 | ThisBuild / organizationHomepage := Some(url("https://www.absa.africa"))
19 | ThisBuild / scmInfo := Some(
20 | ScmInfo(
21 | browseUrl = url("https://github.com/AbsaOSS/spark-hats/tree/master"),
22 | connection = "scm:git:ssh://github.com/AbsaOSS/spark-hats.git",
23 | devConnection = "scm:git:ssh://github.com/AbsaOSS/spark-hats.git"
24 | )
25 | )
26 |
27 | ThisBuild / developers := List(
28 | Developer(
29 | id = "yruslan",
30 | name = "Ruslan Iushchenko",
31 | email = "ruslan.iushchenko@absa.africa",
32 | url = url("https://github.com/yruslan")
33 | )
34 | )
35 |
36 | ThisBuild / homepage := Some(url("https://github.com/AbsaOSS/spark-hats"))
37 | ThisBuild / description := "Spark extensions for working with nested arrays and structs"
38 | ThisBuild / startYear := Some(2020)
39 | ThisBuild / licenses += "Apache-2.0" -> url("https://www.apache.org/licenses/LICENSE-2.0.txt")
40 |
41 | ThisBuild / pomIncludeRepository := { _ => false }
42 | ThisBuild / publishTo := {
43 | val nexus = "https://oss.sonatype.org/"
44 | if (isSnapshot.value) {
45 | Some("snapshots" at s"${nexus}content/repositories/snapshots")
46 | } else {
47 | Some("releases" at s"${nexus}service/local/staging/deploy/maven2")
48 | }
49 | }
50 | ThisBuild / publishMavenStyle := true
51 |
-------------------------------------------------------------------------------- /src/main/scala/za/co/absa/spark/hats/Extensions.scala: --------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats
18 |
19 | import org.apache.spark.sql.types.StructType
20 | import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
21 | import za.co.absa.spark.hats.transformations.NestedArrayTransformations
22 |
23 | /**
24 | * The object is a container of extension methods for Spark DataFrames.
25 | */
26 | object Extensions {
27 |
28 | type TransformFunction = Column => Column
29 |
30 | type ExtendedTransformFunction = (String => Column) => Column
31 |
32 | /**
33 | * The class represents an extension wrapper for an [[org.apache.spark.sql.DataFrame]].
34 | *
35 | * @param dataset A data frame to be extended with methods contained in this class.
36 | */
37 | implicit class DataFrameExtension(dataset: Dataset[Row]) {
38 |
39 | /**
40 | * Add a column that can be inside nested structs, arrays and their combinations.
41 | *
42 | * @param newColumnName A column name to be created
43 | * @param expression A new column value
44 | * @return A dataframe with a new field that contains transformed values.
45 | */
46 | def nestedWithColumn(newColumnName: String,
47 | expression: Column): Dataset[Row] = {
48 | NestedArrayTransformations.nestedAddColumn(dataset, newColumnName, expression)
49 | }
50 |
51 | /**
52 | * Add a column that can be inside nested structs, arrays and their combinations, using an expression that can reference other fields of the dataframe via `getField`.
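 * A usage sketch (mirroring the README example; `id` is a hypothetical root-level column and `my_array.b` a field inside an array):
 * {{{
 *   df.nestedWithColumnExtended("my_array.c", getField =>
 *     concat(getField("id").cast("string"), getField("my_array.b"))
 *   )
 * }}}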
53 | *
54 | * @param newColumnName A column name to be created
55 | * @param expression A function that, given the `getField` function, returns the value of the new column
56 | * @return A dataframe with a new field that contains transformed values.
57 | */
58 | def nestedWithColumnExtended(newColumnName: String,
59 | expression: ExtendedTransformFunction): Dataset[Row] = {
60 | NestedArrayTransformations.nestedAddColumnExtended(dataset, newColumnName,
61 | (_, getFieldFunction) => expression(getFieldFunction))
62 | }
63 |
64 | /**
65 | * Drop a column from inside nested structs, arrays and their combinations.
66 | *
67 | * @param columnToDrop A column name to be dropped
68 | * @return A dataframe with the specified column dropped.
69 | */
70 | def nestedDropColumn(columnToDrop: String): DataFrame = {
71 | NestedArrayTransformations.nestedDropColumn(dataset, columnToDrop)
72 | }
73 |
74 | /**
75 | * Map transformation for columns that can be inside nested structs, arrays and their combinations.
76 | *
77 | * If the input column is a primitive field the method will add outputColumnName at the same level of nesting
78 | * by executing the `expression` passing the source column into it. If a struct column is expected you can
79 | * use the `.getField(...)` method to operate on its children.
80 | *
81 | * The output column name can omit the full path as the field will be created at the same level of nesting as the input column.
82 | *
83 | * @param inputColumnName A column name for which to apply the transformation, e.g. `company.employee.firstName`.
84 | * @param outputColumnName The output column name. The path is optional, e.g. you can use `conformedName` instead of `company.employee.conformedName`.
85 | * @param expression A function that applies a transformation to a column as a Spark expression.
86 | * @return A dataframe with a new field that contains transformed values.
87 | */
88 | def nestedMapColumn(inputColumnName: String,
89 | outputColumnName: String,
90 | expression: TransformFunction): DataFrame = {
91 | NestedArrayTransformations.nestedWithColumnMap(dataset, inputColumnName, outputColumnName, expression)
92 | }
93 |
94 | /**
95 | * Moves all fields of the specified struct up one level. This can only be invoked on a struct nested in another struct.
96 | *
97 | * {{{
98 | * root
99 | * |-- a: struct
100 | * | |-- b: struct
101 | * | | |-- c: string
102 | * | | |-- d: string
103 | *
104 | * df.nestedUnstruct("a.b")
105 | *
106 | * root
107 | * |-- a: struct
108 | * | |-- c: string
109 | * | |-- d: string
110 | * }}}
111 | *
112 | *
113 | * @param inputColumnName A struct column name that contains the fields to extract.
114 | * @return A dataframe with the struct removed and its fields moved up one level.
115 | */
116 | def nestedUnstruct(inputColumnName: String): DataFrame = {
117 | NestedArrayTransformations.nestedUnstruct(dataset, inputColumnName)
118 | }
119 |
120 | }
121 |
122 | }
123 |
-------------------------------------------------------------------------------- /src/main/scala/za/co/absa/spark/hats/transformations/ArrayContext.scala: --------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.transformations
18 |
19 | import org.apache.spark.sql.Column
20 | import org.apache.spark.sql.functions.col
21 | import za.co.absa.spark.hats.transformations.NestedArrayTransformations.splitByDeepestParent
22 |
23 | /**
24 | * The class provides storage for the array transformation context used when transforming a dataframe field.
25 | * The context contains all arrays in the path of the field and their corresponding array element lambda variables
26 | * provided by the 'transform()' function of Spark SQL.
27 | */
28 | private[transformations]
29 | class ArrayContext(val arrayPaths: Seq[String] = Array[String](),
30 | val lambdaVars: Seq[Column] = Array[Column]()) {
31 |
32 | /**
33 | * Returns a new context by appending the current context with a new array/lambda combination.
34 | *
35 | * @param arrayPath A fully-qualified array field name.
36 | * @param lambdaVar A lambda variable of the array element provided by the 'transform()' function of Spark SQL.
37 | * @return A new context with the array path and its lambda variable appended.
38 | */
39 | def withArraysUpdated(arrayPath: String, lambdaVar: Column): ArrayContext = {
40 | new ArrayContext(arrayPaths :+ arrayPath, lambdaVars :+ lambdaVar)
41 | }
42 |
43 | /**
44 | * Returns an instance of Column that corresponds to the input field's level of array nesting.
45 | *
46 | * @param fieldName A fully-qualified field name.
47 | * @return A column that corresponds to the field name.
48 | */
49 | def getField(fieldName: String): Column = {
50 | val (parentArray, childField) = splitByDeepestParent(fieldName, arrayPaths)
51 | if (parentArray.isEmpty) {
52 | col(childField)
53 | } else {
54 | val i = arrayPaths.indexOf(parentArray)
55 | if (fieldName == arrayPaths(i)) {
56 | // If the array itself is specified - return the array
57 | lambdaVars(i)
58 | } else {
59 | // If a field inside an array is specified - return the field
60 | // by using '.getField()' on each child (which could be a nested struct)
61 | childField.split('.')
62 | .foldLeft(lambdaVars(i))((parent, column) => parent.getField(column))
63 | }
64 | }
65 | }
66 | }
67 |
-------------------------------------------------------------------------------- /src/main/scala/za/co/absa/spark/hats/utils/JsonUtils.scala: --------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.utils
18 |
19 | import com.fasterxml.jackson.databind.ObjectMapper
20 | import org.apache.spark.sql.{DataFrame, SparkSession}
21 |
22 | object JsonUtils {
23 |
24 | /**
25 | * Formats a JSON string so it looks pretty.
26 | *
27 | * @param jsonIn A JSON string
28 | * @return A pretty formatted JSON string
29 | */
30 | def prettyJSON(jsonIn: String): String = {
31 | val mapper = new ObjectMapper()
32 |
33 | val jsonUnindented = mapper.readValue(jsonIn, classOf[Any])
34 | val indented = mapper.writerWithDefaultPrettyPrinter.writeValueAsString(jsonUnindented)
35 | indented.replace("\r\n", "\n")
36 | }
37 |
38 | /**
39 | * Formats Spark-generated JSON strings that are returned by
40 | * applying `.toJSON.collect()` to a DataFrame.
41 | *
42 | * @param jsons A list of JSON documents
43 | * @return A pretty formatted JSON string
44 | */
45 | def prettySparkJSON(jsons: Seq[String]): String = {
46 | //val properJson = "[" + "}\n".r.replaceAllIn(jsonIn, "},\n") + "]"
47 | val singleJSON = jsons.mkString("[", ",", "]")
48 | prettyJSON(singleJSON)
49 | }
50 |
51 | /**
52 | * Creates a Spark DataFrame from one or more JSON documents.
53 | *
54 | * @param spark A Spark session.
55 | * @param json A sequence of JSON strings to convert to a DataFrame.
56 | * @return A data frame
57 | */
58 | def getDataFrameFromJson(spark: SparkSession, json: Seq[String]): DataFrame = {
59 | import spark.implicits._
60 | spark.read.json(json.toDS)
61 | }
62 | }
63 |
-------------------------------------------------------------------------------- /src/main/scala/za/co/absa/spark/hats/utils/SchemaUtils.scala: --------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.utils
18 |
19 | import org.apache.spark.sql.types._
20 |
21 | import scala.annotation.tailrec
22 | import scala.util.Random
23 |
24 | object SchemaUtils {
25 |
26 | /**
27 | * For an array of arrays of arrays, ... get the final element type at the bottom of the array nesting.
28 | *
29 | * @param arrayType An array data type from a Spark dataframe schema
30 | * @return A non-array data type at the bottom of array nesting
31 | */
32 | @tailrec
33 | def getDeepestArrayType(arrayType: ArrayType): DataType = {
34 | arrayType.elementType match {
35 | case a: ArrayType => getDeepestArrayType(a)
36 | case b => b
37 | }
38 | }
39 |
40 | /**
41 | * Generates a unique column name.
42 | *
43 | * @param prefix A prefix to use for the column name
44 | * @param schema An optional schema used to check that the column does not already exist (a collision has a very low probability)
45 | * @return A name that can be used as a unique column name
46 | */
47 | def getUniqueName(prefix: String, schema: Option[StructType]): String = {
48 | schema match {
49 | case None =>
50 | s"${prefix}_${Random.nextLong().abs}"
51 | case Some(sch) =>
52 | var exists = true
53 | var columnName = ""
54 | while (exists) {
55 | columnName = s"${prefix}_${Random.nextLong().abs}"
56 | exists = sch.fields.exists(_.name.compareToIgnoreCase(columnName) == 0)
57 | }
58 | columnName
59 | }
60 | }
61 |
62 | }
63 |
-------------------------------------------------------------------------------- /src/main/scala_2.11/za/co/absa/spark/hats/HofsWrapper.scala: --------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats
18 |
19 | import org.apache.spark.sql.Column
20 |
21 | import za.co.absa.spark.hofs.{transform => hofsTransform}
22 |
23 | /**
24 | * This is a wrapper for higher-order functions that differ depending on the Scala version.
25 | *
26 | * This implementation uses spark-hofs (https://github.com/AbsaOSS/spark-hofs).
27 | */
28 | object HofsWrapper {
29 | /**
30 | * Applies the function `f` to every element in the `array`. The method is equivalent to the `map` function
31 | * from functional programming.
32 | *
33 | * @param array A column of arrays
34 | * @param f A function transforming individual elements of the array
35 | * @param elementName The name of the lambda variable. The value is used in Spark execution plans.
36 | * @return A column of arrays with transformed elements
37 | */
38 | def transform(
39 | array: Column,
40 | f: Column => Column,
41 | elementName: String): Column = {
42 | hofsTransform(array, f, elementName)
43 | }
44 | }
45 |
-------------------------------------------------------------------------------- /src/main/scala_2.12/za/co/absa/spark/hats/HofsWrapper.scala: --------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
-------------------------------------------------------------------------------- /src/main/scala_2.11/za/co/absa/spark/hats/HofsWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.spark.hats 18 | 19 | import org.apache.spark.sql.Column 20 | 21 | import za.co.absa.spark.hofs.{transform => hofsTransform} 22 | 23 | /** 24 | * This is a wrapper for higher-order functions, chosen according to the Scala version. 25 | * 26 | * This implementation uses spark-hofs (https://github.com/AbsaOSS/spark-hofs). 27 | */ 28 | object HofsWrapper { 29 | /** 30 | * Applies the function `f` to every element in the `array`. The method is equivalent to the `map` function 31 | * from functional programming. 32 | * 33 | * @param array A column of arrays 34 | * @param f A function transforming individual elements of the array 35 | * @param elementName The name of the lambda variable. The value is used in Spark execution plans. 36 | * @return A column of arrays with transformed elements 37 | */ 38 | def transform( 39 | array: Column, 40 | f: Column => Column, 41 | elementName: String): Column = { 42 | hofsTransform(array, f, elementName) 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/scala_2.12/za/co/absa/spark/hats/HofsWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.spark.hats 18 | 19 | import org.apache.spark.sql.Column 20 | import org.apache.spark.sql.functions.{transform => sparkTransform} 21 | 22 | /** 23 | * This is a wrapper for higher-order functions, chosen according to the Scala version. 24 | * 25 | * This implementation uses the native Spark transform(). 26 | */ 27 | object HofsWrapper { 28 | /** 29 | * Applies the function `f` to every element in the `array`. The method is equivalent to the `map` function 30 | * from functional programming. 31 | * 32 | * @param array A column of arrays 33 | * @param f A function transforming individual elements of the array 34 | * @param elementName The name of the lambda variable. It is ignored here, since the native Spark transform() names the lambda variable itself. 35 | * @return A column of arrays with transformed elements 36 | */ 37 | def transform( 38 | array: Column, 39 | f: Column => Column, 40 | elementName: String): Column = { 41 | sparkTransform(array, f) 42 | } 43 | } 44 | -------------------------------------------------------------------------------- /src/main/scala_2.13/za/co/absa/spark/hats/HofsWrapper.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.spark.hats 18 | 19 | import org.apache.spark.sql.Column 20 | import org.apache.spark.sql.functions.{transform => sparkTransform} 21 | 22 | /** 23 | * This is a wrapper for higher-order functions, chosen according to the Scala version. 24 | * 25 | * This implementation uses the native Spark transform(). 26 | */ 27 | object HofsWrapper { 28 | /** 29 | * Applies the function `f` to every element in the `array`. The method is equivalent to the `map` function 30 | * from functional programming. 31 | * 32 | * @param array A column of arrays 33 | * @param f A function transforming individual elements of the array 34 | * @param elementName The name of the lambda variable. It is ignored here, since the native Spark transform() names the lambda variable itself. 35 | * @return A column of arrays with transformed elements 36 | */ 37 | def transform( 38 | array: Column, 39 | f: Column => Column, 40 | elementName: String): Column = { 41 | sparkTransform(array, f) 42 | } 43 | } 44 |
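Whichever of the three source directories is compiled in, the call site is identical. A minimal usage sketch (the DataFrame and the column name "values" are made up for illustration):

    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.functions.col
    import za.co.absa.spark.hats.HofsWrapper

    // Adds 1 to every element of the array column "values",
    // e.g. [1, 2, 3] becomes [2, 3, 4].
    def incrementAll(df: DataFrame): DataFrame =
      df.withColumn("values", HofsWrapper.transform(col("values"), e => e + 1, "elem"))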
-------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Copyright 2020 ABSA Group Limited 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | 14 | log4j.rootCategory=INFO, console 15 | log4j.appender.console=org.apache.log4j.ConsoleAppender 16 | log4j.appender.console.target=System.err 17 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 18 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 19 | log4j.appender.console.Threshold=ERROR 20 | -------------------------------------------------------------------------------- /src/test/resources/log4j2.properties: -------------------------------------------------------------------------------- 1 | # Copyright 2020 ABSA Group Limited 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # http://www.apache.org/licenses/LICENSE-2.0 7 | # 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License.
13 | 14 | log4j.rootCategory=INFO, console 15 | log4j.appender.console=org.apache.log4j.ConsoleAppender 16 | log4j.appender.console.target=System.err 17 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 18 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n 19 | log4j.appender.console.Threshold=ERROR 20 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested10Results.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "struct3" : { 3 | "inner3" : { 4 | "array3" : [ { 5 | "a1" : 3, 6 | "a2" : 1, 7 | "a3" : "1", 8 | "out" : "3 1" 9 | }, { 10 | "a1" : 4, 11 | "a2" : 2, 12 | "a3" : "5", 13 | "out" : "4 2" 14 | } ] 15 | } 16 | }, 17 | "errCol" : [ { 18 | "errType" : "Initial", 19 | "errCode" : "000", 20 | "errMsg" : "ErrMsg", 21 | "errCol" : "id", 22 | "rawValues" : [ ], 23 | "mappings" : [ ] 24 | }, { 25 | "errType" : "confCastError", 26 | "errCode" : "E00003", 27 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 28 | "errCol" : "a1!==3", 29 | "rawValues" : [ "4" ], 30 | "mappings" : [ ] 31 | } ] 32 | }, { 33 | "struct3" : { 34 | "inner3" : { 35 | "array3" : [ { 36 | "a1" : 4, 37 | "a2" : 2, 38 | "a3" : "3", 39 | "out" : "4 2" 40 | }, { 41 | "a1" : 8, 42 | "a2" : 2, 43 | "a3" : "5", 44 | "out" : "8 2" 45 | } ] 46 | } 47 | }, 48 | "errCol" : [ { 49 | "errType" : "Initial", 50 | "errCode" : "000", 51 | "errMsg" : "ErrMsg", 52 | "errCol" : "id", 53 | "rawValues" : [ ], 54 | "mappings" : [ ] 55 | }, { 56 | "errType" : "confCastError", 57 | "errCode" : "E00003", 58 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 59 | "errCol" : "a1!==3", 60 | "rawValues" : [ "4" ], 61 | "mappings" : [ ] 62 | }, { 63 | "errType" : "confCastError", 64 | "errCode" : "E00003", 65 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 66 | "errCol" : "a1!==3", 67 | "rawValues" : [ "8" ], 68 | "mappings" : [ ] 69 | } ] 70 | }, { 71 | "struct3" : { 72 | "inner3" : { 73 | "array3" : [ { 74 | "a1" : 5, 75 | "a2" : 3, 76 | "a3" : "4", 77 | "out" : "5 3" 78 | }, { 79 | "a1" : 8, 80 | "a2" : 4, 81 | "a3" : "7", 82 | "out" : "8 4" 83 | } ] 84 | } 85 | }, 86 | "errCol" : [ { 87 | "errType" : "Initial", 88 | "errCode" : "000", 89 | "errMsg" : "ErrMsg", 90 | "errCol" : "id", 91 | "rawValues" : [ ], 92 | "mappings" : [ ] 93 | }, { 94 | "errType" : "confCastError", 95 | "errCode" : "E00003", 96 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 97 | "errCol" : "a1!==3", 98 | "rawValues" : [ "5" ], 99 | "mappings" : [ ] 100 | }, { 101 | "errType" : "confCastError", 102 | "errCode" : "E00003", 103 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 104 | "errCol" : "a1!==3", 105 | "rawValues" : [ "8" ], 106 | "mappings" : [ ] 107 | } ] 108 | }, { 109 | "struct3" : { 110 | "inner3" : { 111 | "array3" : [ { 112 | "a1" : 6, 113 | "a2" : 4, 114 | "a3" : "6", 115 | "out" : "6 4" 116 | }, { 117 | "a1" : 9, 118 | "a2" : 3, 119 | "a3" : "7", 120 | "out" : "9 3" 121 | } ] 122 | } 123 | }, 124 | "errCol" : [ { 125 | "errType" : "Initial", 126 | "errCode" : "000", 127 | "errMsg" : "ErrMsg", 128 | "errCol" : "id", 129 | "rawValues" : [ ], 130 | "mappings" : [ ] 131 | }, { 132 | "errType" : "confCastError", 133 | "errCode" : "E00003", 134 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 135 | "errCol" : "a1!==3", 136 | 
"rawValues" : [ "6" ], 137 | "mappings" : [ ] 138 | }, { 139 | "errType" : "confCastError", 140 | "errCode" : "E00003", 141 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 142 | "errCol" : "a1!==3", 143 | "rawValues" : [ "9" ], 144 | "mappings" : [ ] 145 | } ] 146 | }, { 147 | "struct3" : { 148 | "inner3" : { 149 | "array3" : [ { 150 | "a1" : 7, 151 | "a2" : 5, 152 | "a3" : "7", 153 | "out" : "7 5" 154 | } ] 155 | } 156 | }, 157 | "errCol" : [ { 158 | "errType" : "Initial", 159 | "errCode" : "000", 160 | "errMsg" : "ErrMsg", 161 | "errCol" : "id", 162 | "rawValues" : [ ], 163 | "mappings" : [ ] 164 | }, { 165 | "errType" : "confCastError", 166 | "errCode" : "E00003", 167 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 168 | "errCol" : "a1!==3", 169 | "rawValues" : [ "7" ], 170 | "mappings" : [ ] 171 | } ] 172 | }, { 173 | "struct3" : { 174 | "inner3" : { 175 | "array3" : [ { 176 | "a1" : 4, 177 | "a2" : 6, 178 | "a3" : "5", 179 | "out" : "4 6" 180 | } ] 181 | } 182 | }, 183 | "errCol" : [ { 184 | "errType" : "Initial", 185 | "errCode" : "000", 186 | "errMsg" : "ErrMsg", 187 | "errCol" : "id", 188 | "rawValues" : [ ], 189 | "mappings" : [ ] 190 | }, { 191 | "errType" : "confCastError", 192 | "errCode" : "E00003", 193 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 194 | "errCol" : "a1!==3", 195 | "rawValues" : [ "4" ], 196 | "mappings" : [ ] 197 | } ] 198 | }, { 199 | "struct3" : { 200 | "inner3" : { } 201 | }, 202 | "errCol" : [ { 203 | "errType" : "Initial", 204 | "errCode" : "000", 205 | "errMsg" : "ErrMsg", 206 | "errCol" : "id", 207 | "rawValues" : [ ], 208 | "mappings" : [ ] 209 | } ] 210 | }, { 211 | "struct3" : { 212 | "inner3" : { } 213 | }, 214 | "errCol" : [ { 215 | "errType" : "Initial", 216 | "errCode" : "000", 217 | "errMsg" : "ErrMsg", 218 | "errCol" : "id", 219 | "rawValues" : [ ], 220 | "mappings" : [ ] 221 | } ] 222 | } ] 223 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested10Schema.txt: -------------------------------------------------------------------------------- 1 | root 2 | |-- struct3: struct (nullable = false) 3 | | |-- inner3: struct (nullable = false) 4 | | | |-- array3: array (nullable = true) 5 | | | | |-- element: struct (containsNull = false) 6 | | | | | |-- a1: long (nullable = true) 7 | | | | | |-- a2: long (nullable = true) 8 | | | | | |-- a3: string (nullable = true) 9 | | | | | |-- out: string (nullable = true) 10 | |-- errCol: array (nullable = true) 11 | | |-- element: struct (containsNull = true) 12 | | | |-- errType: string (nullable = true) 13 | | | |-- errCode: string (nullable = true) 14 | | | |-- errMsg: string (nullable = true) 15 | | | |-- errCol: string (nullable = true) 16 | | | |-- rawValues: array (nullable = true) 17 | | | | |-- element: string (containsNull = true) 18 | | | |-- mappings: array (nullable = true) 19 | | | | |-- element: struct (containsNull = true) 20 | | | | | |-- mappingTableColumn: string (nullable = true) 21 | | | | | |-- mappedDatasetColumn: string (nullable = true) 22 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested1Results.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "id" : 1, 3 | "key1" : 1, 4 | "key2" : 2, 5 | "struct1" : { 6 | "key3" : 3, 7 | "key4" : 1 8 | }, 9 | "struct2" : { 10 | "inner1" : { 11 | "key5" : 3, 12 | 
"key6" : 1, 13 | "skey1" : "1" 14 | } 15 | }, 16 | "struct3" : { 17 | "inner3" : { 18 | "array3" : [ { 19 | "a1" : 3, 20 | "a2" : 1, 21 | "a3" : "1" 22 | }, { 23 | "a1" : 4, 24 | "a2" : 2, 25 | "a3" : "5" 26 | } ] 27 | } 28 | }, 29 | "array1" : [ { 30 | "key7" : 2, 31 | "key8" : 3, 32 | "skey2" : "1" 33 | }, { 34 | "key7" : 1, 35 | "key8" : 2, 36 | "skey2" : "2" 37 | }, { 38 | "key7" : 3, 39 | "key8" : 3, 40 | "skey2" : "3" 41 | } ], 42 | "array2" : [ { 43 | "key2" : 1, 44 | "inner2" : [ { 45 | "key9" : 1, 46 | "key10" : 2, 47 | "struct3" : { 48 | "k1" : 1, 49 | "k2" : 2 50 | } 51 | }, { 52 | "key9" : 1, 53 | "key10" : 2, 54 | "struct3" : { 55 | "k1" : 2, 56 | "k2" : 2 57 | } 58 | } ] 59 | }, { 60 | "key2" : 2, 61 | "inner2" : [ { 62 | "key9" : 3, 63 | "key10" : 1, 64 | "struct3" : { 65 | "k1" : 1, 66 | "k2" : 2 67 | } 68 | }, { 69 | "key9" : 2, 70 | "key10" : 2, 71 | "struct3" : { 72 | "k1" : 3, 73 | "k2" : 3 74 | } 75 | } ] 76 | } ], 77 | "id_str" : "1 1 2" 78 | }, { 79 | "id" : 2, 80 | "key1" : 2, 81 | "key2" : 1, 82 | "struct1" : { 83 | "key3" : 2, 84 | "key4" : 3 85 | }, 86 | "struct2" : { 87 | "inner1" : { 88 | "key5" : 2, 89 | "key6" : 3, 90 | "skey1" : "2" 91 | } 92 | }, 93 | "struct3" : { 94 | "inner3" : { 95 | "array3" : [ { 96 | "a1" : 4, 97 | "a2" : 2, 98 | "a3" : "3" 99 | }, { 100 | "a1" : 8, 101 | "a2" : 2, 102 | "a3" : "5" 103 | } ] 104 | } 105 | }, 106 | "array1" : [ { 107 | "key7" : 4, 108 | "key8" : 2, 109 | "skey2" : "2" 110 | }, { 111 | "key7" : 3, 112 | "key8" : 1, 113 | "skey2" : "3" 114 | }, { 115 | "key7" : 3, 116 | "key8" : 3, 117 | "skey2" : "3" 118 | } ], 119 | "array2" : [ { 120 | "key2" : 2, 121 | "inner2" : [ { 122 | "key9" : 1, 123 | "key10" : 2, 124 | "struct3" : { 125 | "k1" : 1, 126 | "k2" : 1 127 | } 128 | }, { 129 | "key9" : 1, 130 | "key10" : 2, 131 | "struct3" : { 132 | "k1" : 1, 133 | "k2" : 1 134 | } 135 | } ] 136 | }, { 137 | "key2" : 3, 138 | "inner2" : [ { 139 | "key9" : 3, 140 | "key10" : 1, 141 | "struct3" : { 142 | "k1" : 2, 143 | "k2" : 1 144 | } 145 | }, { 146 | "key9" : 4, 147 | "key10" : 1, 148 | "struct3" : { 149 | "k1" : 3, 150 | "k2" : 3 151 | } 152 | } ] 153 | } ], 154 | "id_str" : "2 2 1" 155 | }, { 156 | "id" : 3, 157 | "key1" : 3, 158 | "key2" : 2, 159 | "struct1" : { 160 | "key3" : 1, 161 | "key4" : 2 162 | }, 163 | "struct2" : { 164 | "inner1" : { 165 | "key5" : 1, 166 | "key6" : 2, 167 | "skey1" : "3" 168 | } 169 | }, 170 | "struct3" : { 171 | "inner3" : { 172 | "array3" : [ { 173 | "a1" : 5, 174 | "a2" : 3, 175 | "a3" : "4" 176 | }, { 177 | "a1" : 8, 178 | "a2" : 4, 179 | "a3" : "7" 180 | } ] 181 | } 182 | }, 183 | "array1" : [ ], 184 | "array2" : [ { 185 | "key2" : 3, 186 | "inner2" : [ { 187 | "key9" : 2, 188 | "key10" : 3, 189 | "struct3" : { 190 | "k1" : 2, 191 | "k2" : 2 192 | } 193 | }, { 194 | "key9" : 2, 195 | "key10" : 1 196 | } ] 197 | }, { 198 | "key2" : 2, 199 | "inner2" : [ { 200 | "key9" : 3, 201 | "key10" : 1, 202 | "struct3" : { 203 | "k1" : 1, 204 | "k2" : 2 205 | } 206 | }, { 207 | "key9" : 2, 208 | "key10" : 1, 209 | "struct3" : { 210 | "k1" : 1, 211 | "k2" : 1 212 | } 213 | } ] 214 | } ], 215 | "id_str" : "3 3 2" 216 | }, { 217 | "id" : 4, 218 | "key1" : 2, 219 | "key2" : 3, 220 | "struct1" : { 221 | "key3" : 2, 222 | "key4" : 1 223 | }, 224 | "struct2" : { 225 | "inner1" : { 226 | "key5" : 3, 227 | "key6" : 2, 228 | "skey1" : "2" 229 | } 230 | }, 231 | "struct3" : { 232 | "inner3" : { 233 | "array3" : [ { 234 | "a1" : 6, 235 | "a2" : 4, 236 | "a3" : "6" 237 | }, { 238 | "a1" : 9, 239 | "a2" : 3, 240 | "a3" : 
"7" 241 | } ] 242 | } 243 | }, 244 | "array1" : [ ], 245 | "array2" : [ { 246 | "key2" : 4, 247 | "inner2" : [ ] 248 | }, { 249 | "key2" : 1, 250 | "inner2" : [ { 251 | "key9" : 2, 252 | "key10" : 2, 253 | "struct3" : { 254 | "k1" : 1, 255 | "k2" : 1 256 | } 257 | } ] 258 | } ], 259 | "id_str" : "4 2 3" 260 | }, { 261 | "id" : 5, 262 | "key1" : 4, 263 | "key2" : 1, 264 | "struct1" : { 265 | "key3" : 3, 266 | "key4" : 3 267 | }, 268 | "struct2" : { 269 | "inner1" : { 270 | "key5" : 2, 271 | "key6" : 1, 272 | "skey1" : "3" 273 | } 274 | }, 275 | "struct3" : { 276 | "inner3" : { 277 | "array3" : [ { 278 | "a1" : 7, 279 | "a2" : 5, 280 | "a3" : "7" 281 | } ] 282 | } 283 | }, 284 | "array1" : [ ], 285 | "array2" : [ ], 286 | "id_str" : "5 4 1" 287 | }, { 288 | "id" : 6, 289 | "key1" : 1, 290 | "key2" : 3, 291 | "struct1" : { 292 | "key3" : 1, 293 | "key4" : 2 294 | }, 295 | "struct2" : { 296 | "inner1" : { 297 | "key5" : 1, 298 | "key6" : 2, 299 | "skey1" : "4" 300 | } 301 | }, 302 | "struct3" : { 303 | "inner3" : { 304 | "array3" : [ { 305 | "a1" : 4, 306 | "a2" : 6, 307 | "a3" : "5" 308 | } ] 309 | } 310 | }, 311 | "array1" : [ ], 312 | "array2" : [ ], 313 | "id_str" : "6 1 3" 314 | }, { 315 | "id" : 7, 316 | "key1" : 1, 317 | "key2" : 3, 318 | "struct1" : { 319 | "key3" : 1, 320 | "key4" : 2 321 | }, 322 | "struct2" : { 323 | "inner1" : { 324 | "key5" : 1 325 | } 326 | }, 327 | "array1" : [ ], 328 | "array2" : [ ], 329 | "id_str" : "7 1 3" 330 | }, { 331 | "id" : 8, 332 | "key1" : 1, 333 | "struct1" : { 334 | "key3" : 1 335 | } 336 | } ] 337 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested1Schema.txt: -------------------------------------------------------------------------------- 1 | root 2 | |-- id: long (nullable = true) 3 | |-- key1: long (nullable = true) 4 | |-- key2: long (nullable = true) 5 | |-- struct1: struct (nullable = true) 6 | | |-- key3: integer (nullable = true) 7 | | |-- key4: integer (nullable = true) 8 | |-- struct2: struct (nullable = true) 9 | | |-- inner1: struct (nullable = true) 10 | | | |-- key5: long (nullable = true) 11 | | | |-- key6: long (nullable = true) 12 | | | |-- skey1: string (nullable = true) 13 | |-- struct3: struct (nullable = true) 14 | | |-- inner3: struct (nullable = true) 15 | | | |-- array3: array (nullable = true) 16 | | | | |-- element: struct (containsNull = true) 17 | | | | | |-- a1: long (nullable = true) 18 | | | | | |-- a2: long (nullable = true) 19 | | | | | |-- a3: string (nullable = true) 20 | |-- array1: array (nullable = true) 21 | | |-- element: struct (containsNull = true) 22 | | | |-- key7: long (nullable = true) 23 | | | |-- key8: long (nullable = true) 24 | | | |-- skey2: string (nullable = true) 25 | |-- array2: array (nullable = true) 26 | | |-- element: struct (containsNull = true) 27 | | | |-- key2: long (nullable = true) 28 | | | |-- inner2: array (nullable = true) 29 | | | | |-- element: struct (containsNull = true) 30 | | | | | |-- key9: long (nullable = true) 31 | | | | | |-- key10: long (nullable = true) 32 | | | | | |-- struct3: struct (nullable = true) 33 | | | | | | |-- k1: integer (nullable = true) 34 | | | | | | |-- k2: integer (nullable = true) 35 | |-- id_str: string (nullable = true) 36 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested2Results.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "key1" : 1, 3 | 
"struct2" : { 4 | "inner1" : { 5 | "key5" : 3, 6 | "key6" : 1, 7 | "skey1" : "1" 8 | }, 9 | "skey2" : "1 3 1" 10 | } 11 | }, { 12 | "key1" : 2, 13 | "struct2" : { 14 | "inner1" : { 15 | "key5" : 2, 16 | "key6" : 3, 17 | "skey1" : "2" 18 | }, 19 | "skey2" : "2 2 3" 20 | } 21 | }, { 22 | "key1" : 3, 23 | "struct2" : { 24 | "inner1" : { 25 | "key5" : 1, 26 | "key6" : 2, 27 | "skey1" : "3" 28 | }, 29 | "skey2" : "3 1 2" 30 | } 31 | }, { 32 | "key1" : 2, 33 | "struct2" : { 34 | "inner1" : { 35 | "key5" : 3, 36 | "key6" : 2, 37 | "skey1" : "2" 38 | }, 39 | "skey2" : "2 3 2" 40 | } 41 | }, { 42 | "key1" : 4, 43 | "struct2" : { 44 | "inner1" : { 45 | "key5" : 2, 46 | "key6" : 1, 47 | "skey1" : "3" 48 | }, 49 | "skey2" : "4 2 1" 50 | } 51 | }, { 52 | "key1" : 1, 53 | "struct2" : { 54 | "inner1" : { 55 | "key5" : 1, 56 | "key6" : 2, 57 | "skey1" : "4" 58 | }, 59 | "skey2" : "1 1 2" 60 | } 61 | }, { 62 | "key1" : 1, 63 | "struct2" : { 64 | "inner1" : { 65 | "key5" : 1 66 | } 67 | } 68 | }, { 69 | "key1" : 1, 70 | "struct2" : { } 71 | } ] 72 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested2Schema.txt: -------------------------------------------------------------------------------- 1 | root 2 | |-- key1: long (nullable = true) 3 | |-- struct2: struct (nullable = false) 4 | | |-- inner1: struct (nullable = true) 5 | | | |-- key5: long (nullable = true) 6 | | | |-- key6: long (nullable = true) 7 | | | |-- skey1: string (nullable = true) 8 | | |-- skey2: string (nullable = true) 9 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested3Results.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "key1" : 1, 3 | "struct2" : { 4 | "inner1" : { 5 | "key5" : 3, 6 | "key6" : 1, 7 | "skey1" : "1", 8 | "skey2" : "1 3 1" 9 | } 10 | } 11 | }, { 12 | "key1" : 2, 13 | "struct2" : { 14 | "inner1" : { 15 | "key5" : 2, 16 | "key6" : 3, 17 | "skey1" : "2", 18 | "skey2" : "2 2 3" 19 | } 20 | } 21 | }, { 22 | "key1" : 3, 23 | "struct2" : { 24 | "inner1" : { 25 | "key5" : 1, 26 | "key6" : 2, 27 | "skey1" : "3", 28 | "skey2" : "3 1 2" 29 | } 30 | } 31 | }, { 32 | "key1" : 2, 33 | "struct2" : { 34 | "inner1" : { 35 | "key5" : 3, 36 | "key6" : 2, 37 | "skey1" : "2", 38 | "skey2" : "2 3 2" 39 | } 40 | } 41 | }, { 42 | "key1" : 4, 43 | "struct2" : { 44 | "inner1" : { 45 | "key5" : 2, 46 | "key6" : 1, 47 | "skey1" : "3", 48 | "skey2" : "4 2 1" 49 | } 50 | } 51 | }, { 52 | "key1" : 1, 53 | "struct2" : { 54 | "inner1" : { 55 | "key5" : 1, 56 | "key6" : 2, 57 | "skey1" : "4", 58 | "skey2" : "1 1 2" 59 | } 60 | } 61 | }, { 62 | "key1" : 1, 63 | "struct2" : { 64 | "inner1" : { 65 | "key5" : 1 66 | } 67 | } 68 | }, { 69 | "key1" : 1, 70 | "struct2" : { 71 | "inner1" : { } 72 | } 73 | } ] 74 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested3Schema.txt: -------------------------------------------------------------------------------- 1 | root 2 | |-- key1: long (nullable = true) 3 | |-- struct2: struct (nullable = false) 4 | | |-- inner1: struct (nullable = false) 5 | | | |-- key5: long (nullable = true) 6 | | | |-- key6: long (nullable = true) 7 | | | |-- skey1: string (nullable = true) 8 | | | |-- skey2: string (nullable = true) 9 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested4Results.json: 
-------------------------------------------------------------------------------- 1 | [ { 2 | "key1" : 1, 3 | "array1" : [ { 4 | "key7" : 2, 5 | "key8" : 3, 6 | "skey2" : "1", 7 | "skey3" : "1 2 3" 8 | }, { 9 | "key7" : 1, 10 | "key8" : 2, 11 | "skey2" : "2", 12 | "skey3" : "1 1 2" 13 | }, { 14 | "key7" : 3, 15 | "key8" : 3, 16 | "skey2" : "3", 17 | "skey3" : "1 3 3" 18 | } ] 19 | }, { 20 | "key1" : 2, 21 | "array1" : [ { 22 | "key7" : 4, 23 | "key8" : 2, 24 | "skey2" : "2", 25 | "skey3" : "2 4 2" 26 | }, { 27 | "key7" : 3, 28 | "key8" : 1, 29 | "skey2" : "3", 30 | "skey3" : "2 3 1" 31 | }, { 32 | "key7" : 3, 33 | "key8" : 3, 34 | "skey2" : "3", 35 | "skey3" : "2 3 3" 36 | } ] 37 | }, { 38 | "key1" : 3, 39 | "array1" : [ ] 40 | }, { 41 | "key1" : 2, 42 | "array1" : [ ] 43 | }, { 44 | "key1" : 4, 45 | "array1" : [ ] 46 | }, { 47 | "key1" : 1, 48 | "array1" : [ ] 49 | }, { 50 | "key1" : 1, 51 | "array1" : [ ] 52 | }, { 53 | "key1" : 1 54 | } ] 55 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested4Schema.txt: -------------------------------------------------------------------------------- 1 | root 2 | |-- key1: long (nullable = true) 3 | |-- array1: array (nullable = true) 4 | | |-- element: struct (containsNull = false) 5 | | | |-- key7: long (nullable = true) 6 | | | |-- key8: long (nullable = true) 7 | | | |-- skey2: string (nullable = true) 8 | | | |-- skey3: string (nullable = true) 9 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested5Results.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "key1" : 1, 3 | "array2" : [ { 4 | "key2" : 1, 5 | "inner2" : [ { 6 | "key9" : 1, 7 | "key10" : 2, 8 | "struct3" : { 9 | "k1" : 1, 10 | "k2" : 2 11 | }, 12 | "out" : "1 1 1 2" 13 | }, { 14 | "key9" : 1, 15 | "key10" : 2, 16 | "struct3" : { 17 | "k1" : 2, 18 | "k2" : 2 19 | }, 20 | "out" : "1 1 1 2" 21 | } ] 22 | }, { 23 | "key2" : 2, 24 | "inner2" : [ { 25 | "key9" : 3, 26 | "key10" : 1, 27 | "struct3" : { 28 | "k1" : 1, 29 | "k2" : 2 30 | }, 31 | "out" : "1 2 3 1" 32 | }, { 33 | "key9" : 2, 34 | "key10" : 2, 35 | "struct3" : { 36 | "k1" : 3, 37 | "k2" : 3 38 | }, 39 | "out" : "1 2 2 2" 40 | } ] 41 | } ] 42 | }, { 43 | "key1" : 2, 44 | "array2" : [ { 45 | "key2" : 2, 46 | "inner2" : [ { 47 | "key9" : 1, 48 | "key10" : 2, 49 | "struct3" : { 50 | "k1" : 1, 51 | "k2" : 1 52 | }, 53 | "out" : "2 2 1 2" 54 | }, { 55 | "key9" : 1, 56 | "key10" : 2, 57 | "struct3" : { 58 | "k1" : 1, 59 | "k2" : 1 60 | }, 61 | "out" : "2 2 1 2" 62 | } ] 63 | }, { 64 | "key2" : 3, 65 | "inner2" : [ { 66 | "key9" : 3, 67 | "key10" : 1, 68 | "struct3" : { 69 | "k1" : 2, 70 | "k2" : 1 71 | }, 72 | "out" : "2 3 3 1" 73 | }, { 74 | "key9" : 4, 75 | "key10" : 1, 76 | "struct3" : { 77 | "k1" : 3, 78 | "k2" : 3 79 | }, 80 | "out" : "2 3 4 1" 81 | } ] 82 | } ] 83 | }, { 84 | "key1" : 3, 85 | "array2" : [ { 86 | "key2" : 3, 87 | "inner2" : [ { 88 | "key9" : 2, 89 | "key10" : 3, 90 | "struct3" : { 91 | "k1" : 2, 92 | "k2" : 2 93 | }, 94 | "out" : "3 3 2 3" 95 | }, { 96 | "key9" : 2, 97 | "key10" : 1, 98 | "out" : "3 3 2 1" 99 | } ] 100 | }, { 101 | "key2" : 2, 102 | "inner2" : [ { 103 | "key9" : 3, 104 | "key10" : 1, 105 | "struct3" : { 106 | "k1" : 1, 107 | "k2" : 2 108 | }, 109 | "out" : "3 2 3 1" 110 | }, { 111 | "key9" : 2, 112 | "key10" : 1, 113 | "struct3" : { 114 | "k1" : 1, 115 | "k2" : 1 116 | }, 117 | "out" : "3 2 2 1" 118 | } ] 
119 | } ] 120 | }, { 121 | "key1" : 2, 122 | "array2" : [ { 123 | "key2" : 4, 124 | "inner2" : [ ] 125 | }, { 126 | "key2" : 1, 127 | "inner2" : [ { 128 | "key9" : 2, 129 | "key10" : 2, 130 | "struct3" : { 131 | "k1" : 1, 132 | "k2" : 1 133 | }, 134 | "out" : "2 1 2 2" 135 | } ] 136 | } ] 137 | }, { 138 | "key1" : 4, 139 | "array2" : [ ] 140 | }, { 141 | "key1" : 1, 142 | "array2" : [ ] 143 | }, { 144 | "key1" : 1, 145 | "array2" : [ ] 146 | }, { 147 | "key1" : 1 148 | } ] 149 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested5Schema.txt: -------------------------------------------------------------------------------- 1 | root 2 | |-- key1: long (nullable = true) 3 | |-- array2: array (nullable = true) 4 | | |-- element: struct (containsNull = false) 5 | | | |-- key2: long (nullable = true) 6 | | | |-- inner2: array (nullable = true) 7 | | | | |-- element: struct (containsNull = false) 8 | | | | | |-- key9: long (nullable = true) 9 | | | | | |-- key10: long (nullable = true) 10 | | | | | |-- struct3: struct (nullable = true) 11 | | | | | | |-- k1: integer (nullable = true) 12 | | | | | | |-- k2: integer (nullable = true) 13 | | | | | |-- out: string (nullable = true) 14 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested6Results.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "array2" : [ { 3 | "key2" : 1, 4 | "inner2" : [ { 5 | "key9" : 1, 6 | "key10" : 2, 7 | "struct3" : { 8 | "k1" : 1, 9 | "k2" : 2 10 | }, 11 | "out" : "2 1" 12 | }, { 13 | "key9" : 1, 14 | "key10" : 2, 15 | "struct3" : { 16 | "k1" : 2, 17 | "k2" : 2 18 | }, 19 | "out" : "2 2" 20 | } ] 21 | }, { 22 | "key2" : 2, 23 | "inner2" : [ { 24 | "key9" : 3, 25 | "key10" : 1, 26 | "struct3" : { 27 | "k1" : 1, 28 | "k2" : 2 29 | }, 30 | "out" : "1 1" 31 | }, { 32 | "key9" : 2, 33 | "key10" : 2, 34 | "struct3" : { 35 | "k1" : 3, 36 | "k2" : 3 37 | }, 38 | "out" : "2 3" 39 | } ] 40 | } ] 41 | }, { 42 | "array2" : [ { 43 | "key2" : 2, 44 | "inner2" : [ { 45 | "key9" : 1, 46 | "key10" : 2, 47 | "struct3" : { 48 | "k1" : 1, 49 | "k2" : 1 50 | }, 51 | "out" : "2 1" 52 | }, { 53 | "key9" : 1, 54 | "key10" : 2, 55 | "struct3" : { 56 | "k1" : 1, 57 | "k2" : 1 58 | }, 59 | "out" : "2 1" 60 | } ] 61 | }, { 62 | "key2" : 3, 63 | "inner2" : [ { 64 | "key9" : 3, 65 | "key10" : 1, 66 | "struct3" : { 67 | "k1" : 2, 68 | "k2" : 1 69 | }, 70 | "out" : "1 2" 71 | }, { 72 | "key9" : 4, 73 | "key10" : 1, 74 | "struct3" : { 75 | "k1" : 3, 76 | "k2" : 3 77 | }, 78 | "out" : "1 3" 79 | } ] 80 | } ] 81 | }, { 82 | "array2" : [ { 83 | "key2" : 3, 84 | "inner2" : [ { 85 | "key9" : 2, 86 | "key10" : 3, 87 | "struct3" : { 88 | "k1" : 2, 89 | "k2" : 2 90 | }, 91 | "out" : "3 2" 92 | }, { 93 | "key9" : 2, 94 | "key10" : 1 95 | } ] 96 | }, { 97 | "key2" : 2, 98 | "inner2" : [ { 99 | "key9" : 3, 100 | "key10" : 1, 101 | "struct3" : { 102 | "k1" : 1, 103 | "k2" : 2 104 | }, 105 | "out" : "1 1" 106 | }, { 107 | "key9" : 2, 108 | "key10" : 1, 109 | "struct3" : { 110 | "k1" : 1, 111 | "k2" : 1 112 | }, 113 | "out" : "1 1" 114 | } ] 115 | } ] 116 | }, { 117 | "array2" : [ { 118 | "key2" : 4, 119 | "inner2" : [ ] 120 | }, { 121 | "key2" : 1, 122 | "inner2" : [ { 123 | "key9" : 2, 124 | "key10" : 2, 125 | "struct3" : { 126 | "k1" : 1, 127 | "k2" : 1 128 | }, 129 | "out" : "2 1" 130 | } ] 131 | } ] 132 | }, { 133 | "array2" : [ ] 134 | }, { 135 | "array2" : [ ] 136 | }, { 
137 | "array2" : [ ] 138 | }, { } ] 139 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested6Schema.txt: -------------------------------------------------------------------------------- 1 | root 2 | |-- array2: array (nullable = true) 3 | | |-- element: struct (containsNull = false) 4 | | | |-- key2: long (nullable = true) 5 | | | |-- inner2: array (nullable = true) 6 | | | | |-- element: struct (containsNull = false) 7 | | | | | |-- key9: long (nullable = true) 8 | | | | | |-- key10: long (nullable = true) 9 | | | | | |-- struct3: struct (nullable = true) 10 | | | | | | |-- k1: integer (nullable = true) 11 | | | | | | |-- k2: integer (nullable = true) 12 | | | | | |-- out: string (nullable = true) 13 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested7Results.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "array2" : [ { 3 | "key2" : 1, 4 | "inner2" : [ { 5 | "key9" : 1, 6 | "key10" : 2, 7 | "struct3" : { 8 | "k1" : 1, 9 | "k2" : 2, 10 | "out" : "1 2" 11 | } 12 | }, { 13 | "key9" : 1, 14 | "key10" : 2, 15 | "struct3" : { 16 | "k1" : 2, 17 | "k2" : 2, 18 | "out" : "2 2" 19 | } 20 | } ] 21 | }, { 22 | "key2" : 2, 23 | "inner2" : [ { 24 | "key9" : 3, 25 | "key10" : 1, 26 | "struct3" : { 27 | "k1" : 1, 28 | "k2" : 2, 29 | "out" : "1 1" 30 | } 31 | }, { 32 | "key9" : 2, 33 | "key10" : 2, 34 | "struct3" : { 35 | "k1" : 3, 36 | "k2" : 3, 37 | "out" : "3 2" 38 | } 39 | } ] 40 | } ] 41 | }, { 42 | "array2" : [ { 43 | "key2" : 2, 44 | "inner2" : [ { 45 | "key9" : 1, 46 | "key10" : 2, 47 | "struct3" : { 48 | "k1" : 1, 49 | "k2" : 1, 50 | "out" : "1 2" 51 | } 52 | }, { 53 | "key9" : 1, 54 | "key10" : 2, 55 | "struct3" : { 56 | "k1" : 1, 57 | "k2" : 1, 58 | "out" : "1 2" 59 | } 60 | } ] 61 | }, { 62 | "key2" : 3, 63 | "inner2" : [ { 64 | "key9" : 3, 65 | "key10" : 1, 66 | "struct3" : { 67 | "k1" : 2, 68 | "k2" : 1, 69 | "out" : "2 1" 70 | } 71 | }, { 72 | "key9" : 4, 73 | "key10" : 1, 74 | "struct3" : { 75 | "k1" : 3, 76 | "k2" : 3, 77 | "out" : "3 1" 78 | } 79 | } ] 80 | } ] 81 | }, { 82 | "array2" : [ { 83 | "key2" : 3, 84 | "inner2" : [ { 85 | "key9" : 2, 86 | "key10" : 3, 87 | "struct3" : { 88 | "k1" : 2, 89 | "k2" : 2, 90 | "out" : "2 3" 91 | } 92 | }, { 93 | "key9" : 2, 94 | "key10" : 1, 95 | "struct3" : { } 96 | } ] 97 | }, { 98 | "key2" : 2, 99 | "inner2" : [ { 100 | "key9" : 3, 101 | "key10" : 1, 102 | "struct3" : { 103 | "k1" : 1, 104 | "k2" : 2, 105 | "out" : "1 1" 106 | } 107 | }, { 108 | "key9" : 2, 109 | "key10" : 1, 110 | "struct3" : { 111 | "k1" : 1, 112 | "k2" : 1, 113 | "out" : "1 1" 114 | } 115 | } ] 116 | } ] 117 | }, { 118 | "array2" : [ { 119 | "key2" : 4, 120 | "inner2" : [ ] 121 | }, { 122 | "key2" : 1, 123 | "inner2" : [ { 124 | "key9" : 2, 125 | "key10" : 2, 126 | "struct3" : { 127 | "k1" : 1, 128 | "k2" : 1, 129 | "out" : "1 2" 130 | } 131 | } ] 132 | } ] 133 | }, { 134 | "array2" : [ ] 135 | }, { 136 | "array2" : [ ] 137 | }, { 138 | "array2" : [ ] 139 | }, { } ] 140 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested7Schema.txt: -------------------------------------------------------------------------------- 1 | root 2 | |-- array2: array (nullable = true) 3 | | |-- element: struct (containsNull = false) 4 | | | |-- key2: long (nullable = true) 5 | | | |-- inner2: array (nullable = true) 6 | | | | |-- 
element: struct (containsNull = false) 7 | | | | | |-- key9: long (nullable = true) 8 | | | | | |-- key10: long (nullable = true) 9 | | | | | |-- struct3: struct (nullable = false) 10 | | | | | | |-- k1: integer (nullable = true) 11 | | | | | | |-- k2: integer (nullable = true) 12 | | | | | | |-- out: string (nullable = true) 13 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested8Results.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "array2" : [ { 3 | "key2" : 1, 4 | "inner2" : [ { 5 | "key9" : 1, 6 | "key10" : 2, 7 | "struct3" : { 8 | "k1" : 1, 9 | "k2" : 2 10 | }, 11 | "out" : "2 1" 12 | }, { 13 | "key9" : 1, 14 | "key10" : 2, 15 | "struct3" : { 16 | "k1" : 2, 17 | "k2" : 2 18 | }, 19 | "out" : "2 2" 20 | } ] 21 | }, { 22 | "key2" : 2, 23 | "inner2" : [ { 24 | "key9" : 3, 25 | "key10" : 1, 26 | "struct3" : { 27 | "k1" : 1, 28 | "k2" : 2 29 | }, 30 | "out" : "1 1" 31 | }, { 32 | "key9" : 2, 33 | "key10" : 2, 34 | "struct3" : { 35 | "k1" : 3, 36 | "k2" : 3 37 | }, 38 | "out" : "2 3" 39 | } ] 40 | } ], 41 | "errCol" : [ { 42 | "errType" : "confCastError", 43 | "errCode" : "E00003", 44 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 45 | "errCol" : "k1!==1", 46 | "rawValues" : [ "2" ], 47 | "mappings" : [ ] 48 | }, { 49 | "errType" : "confCastError", 50 | "errCode" : "E00003", 51 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 52 | "errCol" : "k1!==1", 53 | "rawValues" : [ "3" ], 54 | "mappings" : [ ] 55 | } ] 56 | }, { 57 | "array2" : [ { 58 | "key2" : 2, 59 | "inner2" : [ { 60 | "key9" : 1, 61 | "key10" : 2, 62 | "struct3" : { 63 | "k1" : 1, 64 | "k2" : 1 65 | }, 66 | "out" : "2 1" 67 | }, { 68 | "key9" : 1, 69 | "key10" : 2, 70 | "struct3" : { 71 | "k1" : 1, 72 | "k2" : 1 73 | }, 74 | "out" : "2 1" 75 | } ] 76 | }, { 77 | "key2" : 3, 78 | "inner2" : [ { 79 | "key9" : 3, 80 | "key10" : 1, 81 | "struct3" : { 82 | "k1" : 2, 83 | "k2" : 1 84 | }, 85 | "out" : "1 2" 86 | }, { 87 | "key9" : 4, 88 | "key10" : 1, 89 | "struct3" : { 90 | "k1" : 3, 91 | "k2" : 3 92 | }, 93 | "out" : "1 3" 94 | } ] 95 | } ], 96 | "errCol" : [ { 97 | "errType" : "confCastError", 98 | "errCode" : "E00003", 99 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 100 | "errCol" : "k1!==1", 101 | "rawValues" : [ "2" ], 102 | "mappings" : [ ] 103 | }, { 104 | "errType" : "confCastError", 105 | "errCode" : "E00003", 106 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 107 | "errCol" : "k1!==1", 108 | "rawValues" : [ "3" ], 109 | "mappings" : [ ] 110 | } ] 111 | }, { 112 | "array2" : [ { 113 | "key2" : 3, 114 | "inner2" : [ { 115 | "key9" : 2, 116 | "key10" : 3, 117 | "struct3" : { 118 | "k1" : 2, 119 | "k2" : 2 120 | }, 121 | "out" : "3 2" 122 | }, { 123 | "key9" : 2, 124 | "key10" : 1 125 | } ] 126 | }, { 127 | "key2" : 2, 128 | "inner2" : [ { 129 | "key9" : 3, 130 | "key10" : 1, 131 | "struct3" : { 132 | "k1" : 1, 133 | "k2" : 2 134 | }, 135 | "out" : "1 1" 136 | }, { 137 | "key9" : 2, 138 | "key10" : 1, 139 | "struct3" : { 140 | "k1" : 1, 141 | "k2" : 1 142 | }, 143 | "out" : "1 1" 144 | } ] 145 | } ], 146 | "errCol" : [ { 147 | "errType" : "confCastError", 148 | "errCode" : "E00003", 149 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 150 | "errCol" : "k1!==1", 151 | "rawValues" : [ "2" ], 152 | "mappings" : [ ] 153 | } ] 154 | }, { 155 | "array2" : [ 
{ 156 | "key2" : 4, 157 | "inner2" : [ ] 158 | }, { 159 | "key2" : 1, 160 | "inner2" : [ { 161 | "key9" : 2, 162 | "key10" : 2, 163 | "struct3" : { 164 | "k1" : 1, 165 | "k2" : 1 166 | }, 167 | "out" : "2 1" 168 | } ] 169 | } ], 170 | "errCol" : [ ] 171 | }, { 172 | "array2" : [ ], 173 | "errCol" : [ ] 174 | }, { 175 | "array2" : [ ], 176 | "errCol" : [ ] 177 | }, { 178 | "array2" : [ ], 179 | "errCol" : [ ] 180 | }, { 181 | "errCol" : [ ] 182 | } ] 183 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested8Schema.txt: -------------------------------------------------------------------------------- 1 | root 2 | |-- array2: array (nullable = true) 3 | | |-- element: struct (containsNull = false) 4 | | | |-- key2: long (nullable = true) 5 | | | |-- inner2: array (nullable = true) 6 | | | | |-- element: struct (containsNull = false) 7 | | | | | |-- key9: long (nullable = true) 8 | | | | | |-- key10: long (nullable = true) 9 | | | | | |-- struct3: struct (nullable = true) 10 | | | | | | |-- k1: integer (nullable = true) 11 | | | | | | |-- k2: integer (nullable = true) 12 | | | | | |-- out: string (nullable = true) 13 | |-- errCol: array (nullable = true) 14 | | |-- element: struct (containsNull = true) 15 | | | |-- errType: string (nullable = true) 16 | | | |-- errCode: string (nullable = true) 17 | | | |-- errMsg: string (nullable = true) 18 | | | |-- errCol: string (nullable = true) 19 | | | |-- rawValues: array (nullable = true) 20 | | | | |-- element: string (containsNull = true) 21 | | | |-- mappings: array (nullable = true) 22 | | | | |-- element: struct (containsNull = true) 23 | | | | | |-- mappingTableColumn: string (nullable = true) 24 | | | | | |-- mappedDatasetColumn: string (nullable = true) 25 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested9Results.json: -------------------------------------------------------------------------------- 1 | [ { 2 | "array2" : [ { 3 | "key2" : 1, 4 | "inner2" : [ { 5 | "key9" : 1, 6 | "key10" : 2, 7 | "struct3" : { 8 | "k1" : 1, 9 | "k2" : 2 10 | }, 11 | "out" : "2 1" 12 | }, { 13 | "key9" : 1, 14 | "key10" : 2, 15 | "struct3" : { 16 | "k1" : 2, 17 | "k2" : 2 18 | }, 19 | "out" : "2 2" 20 | } ] 21 | }, { 22 | "key2" : 2, 23 | "inner2" : [ { 24 | "key9" : 3, 25 | "key10" : 1, 26 | "struct3" : { 27 | "k1" : 1, 28 | "k2" : 2 29 | }, 30 | "out" : "1 1" 31 | }, { 32 | "key9" : 2, 33 | "key10" : 2, 34 | "struct3" : { 35 | "k1" : 3, 36 | "k2" : 3 37 | }, 38 | "out" : "2 3" 39 | } ] 40 | } ], 41 | "errCol" : [ { 42 | "errType" : "Initial", 43 | "errCode" : "000", 44 | "errMsg" : "ErrMsg", 45 | "errCol" : "id", 46 | "rawValues" : [ ], 47 | "mappings" : [ ] 48 | }, { 49 | "errType" : "confCastError", 50 | "errCode" : "E00003", 51 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 52 | "errCol" : "k1!==1", 53 | "rawValues" : [ "2" ], 54 | "mappings" : [ ] 55 | }, { 56 | "errType" : "confCastError", 57 | "errCode" : "E00003", 58 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 59 | "errCol" : "k1!==1", 60 | "rawValues" : [ "3" ], 61 | "mappings" : [ ] 62 | } ] 63 | }, { 64 | "array2" : [ { 65 | "key2" : 2, 66 | "inner2" : [ { 67 | "key9" : 1, 68 | "key10" : 2, 69 | "struct3" : { 70 | "k1" : 1, 71 | "k2" : 1 72 | }, 73 | "out" : "2 1" 74 | }, { 75 | "key9" : 1, 76 | "key10" : 2, 77 | "struct3" : { 78 | "k1" : 1, 79 | "k2" : 1 80 | }, 81 | "out" : "2 1" 82 | } 
] 83 | }, { 84 | "key2" : 3, 85 | "inner2" : [ { 86 | "key9" : 3, 87 | "key10" : 1, 88 | "struct3" : { 89 | "k1" : 2, 90 | "k2" : 1 91 | }, 92 | "out" : "1 2" 93 | }, { 94 | "key9" : 4, 95 | "key10" : 1, 96 | "struct3" : { 97 | "k1" : 3, 98 | "k2" : 3 99 | }, 100 | "out" : "1 3" 101 | } ] 102 | } ], 103 | "errCol" : [ { 104 | "errType" : "Initial", 105 | "errCode" : "000", 106 | "errMsg" : "ErrMsg", 107 | "errCol" : "id", 108 | "rawValues" : [ ], 109 | "mappings" : [ ] 110 | }, { 111 | "errType" : "confCastError", 112 | "errCode" : "E00003", 113 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 114 | "errCol" : "k1!==1", 115 | "rawValues" : [ "2" ], 116 | "mappings" : [ ] 117 | }, { 118 | "errType" : "confCastError", 119 | "errCode" : "E00003", 120 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 121 | "errCol" : "k1!==1", 122 | "rawValues" : [ "3" ], 123 | "mappings" : [ ] 124 | } ] 125 | }, { 126 | "array2" : [ { 127 | "key2" : 3, 128 | "inner2" : [ { 129 | "key9" : 2, 130 | "key10" : 3, 131 | "struct3" : { 132 | "k1" : 2, 133 | "k2" : 2 134 | }, 135 | "out" : "3 2" 136 | }, { 137 | "key9" : 2, 138 | "key10" : 1 139 | } ] 140 | }, { 141 | "key2" : 2, 142 | "inner2" : [ { 143 | "key9" : 3, 144 | "key10" : 1, 145 | "struct3" : { 146 | "k1" : 1, 147 | "k2" : 2 148 | }, 149 | "out" : "1 1" 150 | }, { 151 | "key9" : 2, 152 | "key10" : 1, 153 | "struct3" : { 154 | "k1" : 1, 155 | "k2" : 1 156 | }, 157 | "out" : "1 1" 158 | } ] 159 | } ], 160 | "errCol" : [ { 161 | "errType" : "Initial", 162 | "errCode" : "000", 163 | "errMsg" : "ErrMsg", 164 | "errCol" : "id", 165 | "rawValues" : [ ], 166 | "mappings" : [ ] 167 | }, { 168 | "errType" : "confCastError", 169 | "errCode" : "E00003", 170 | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 171 | "errCol" : "k1!==1", 172 | "rawValues" : [ "2" ], 173 | "mappings" : [ ] 174 | } ] 175 | }, { 176 | "array2" : [ { 177 | "key2" : 4, 178 | "inner2" : [ ] 179 | }, { 180 | "key2" : 1, 181 | "inner2" : [ { 182 | "key9" : 2, 183 | "key10" : 2, 184 | "struct3" : { 185 | "k1" : 1, 186 | "k2" : 1 187 | }, 188 | "out" : "2 1" 189 | } ] 190 | } ], 191 | "errCol" : [ { 192 | "errType" : "Initial", 193 | "errCode" : "000", 194 | "errMsg" : "ErrMsg", 195 | "errCol" : "id", 196 | "rawValues" : [ ], 197 | "mappings" : [ ] 198 | } ] 199 | }, { 200 | "array2" : [ ], 201 | "errCol" : [ { 202 | "errType" : "Initial", 203 | "errCode" : "000", 204 | "errMsg" : "ErrMsg", 205 | "errCol" : "id", 206 | "rawValues" : [ ], 207 | "mappings" : [ ] 208 | } ] 209 | }, { 210 | "array2" : [ ], 211 | "errCol" : [ { 212 | "errType" : "Initial", 213 | "errCode" : "000", 214 | "errMsg" : "ErrMsg", 215 | "errCol" : "id", 216 | "rawValues" : [ ], 217 | "mappings" : [ ] 218 | } ] 219 | }, { 220 | "array2" : [ ], 221 | "errCol" : [ { 222 | "errType" : "Initial", 223 | "errCode" : "000", 224 | "errMsg" : "ErrMsg", 225 | "errCol" : "id", 226 | "rawValues" : [ ], 227 | "mappings" : [ ] 228 | } ] 229 | }, { 230 | "errCol" : [ { 231 | "errType" : "Initial", 232 | "errCode" : "000", 233 | "errMsg" : "ErrMsg", 234 | "errCol" : "id", 235 | "rawValues" : [ ], 236 | "mappings" : [ ] 237 | } ] 238 | } ] 239 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nested9Schema.txt: -------------------------------------------------------------------------------- 1 | root 2 | |-- array2: array (nullable = true) 3 | | |-- element: struct (containsNull = false) 4 | | 
| |-- key2: long (nullable = true) 5 | | | |-- inner2: array (nullable = true) 6 | | | | |-- element: struct (containsNull = false) 7 | | | | | |-- key9: long (nullable = true) 8 | | | | | |-- key10: long (nullable = true) 9 | | | | | |-- struct3: struct (nullable = true) 10 | | | | | | |-- k1: integer (nullable = true) 11 | | | | | | |-- k2: integer (nullable = true) 12 | | | | | |-- out: string (nullable = true) 13 | |-- errCol: array (nullable = true) 14 | | |-- element: struct (containsNull = true) 15 | | | |-- errType: string (nullable = true) 16 | | | |-- errCode: string (nullable = true) 17 | | | |-- errMsg: string (nullable = true) 18 | | | |-- errCol: string (nullable = true) 19 | | | |-- rawValues: array (nullable = true) 20 | | | | |-- element: string (containsNull = true) 21 | | | |-- mappings: array (nullable = true) 22 | | | | |-- element: struct (containsNull = true) 23 | | | | | |-- mappingTableColumn: string (nullable = true) 24 | | | | | |-- mappedDatasetColumn: string (nullable = true) 25 | -------------------------------------------------------------------------------- /src/test/resources/test_data/nested/nestedDf1.json: -------------------------------------------------------------------------------- 1 | {"id":1,"key1":1,"key2":2,"struct1":{"key3":3,"key4":1},"struct2":{"inner1":{"key5":3,"key6":1,"skey1":"1"}},"struct3":{"inner3":{"array3":[{"a1":3,"a2":1,"a3":"1"},{"a1":4,"a2":2,"a3":"5"}]}},"array1":[{"key7":2,"key8":3,"skey2":"1"},{"key7":1,"key8":2,"skey2":"2"},{"key7":3,"key8":3,"skey2":"3"}],"array2":[{"key2":1,"inner2":[{"key9":1,"key10":2,"struct3":{"k1":1,"k2":2}},{"key9":1,"key10":2,"struct3":{"k1":2,"k2":2}}]},{"key2":2,"inner2":[{"key9":3,"key10":1,"struct3":{"k1":1,"k2":2}},{"key9":2,"key10":2,"struct3":{"k1":3,"k2":3}}]}]} 2 | {"id":2,"key1":2,"key2":1,"struct1":{"key3":2,"key4":3},"struct2":{"inner1":{"key5":2,"key6":3,"skey1":"2"}},"struct3":{"inner3":{"array3":[{"a1":4,"a2":2,"a3":"3"},{"a1":8,"a2":2,"a3":"5"}]}},"array1":[{"key7":4,"key8":2,"skey2":"2"},{"key7":3,"key8":1,"skey2":"3"},{"key7":3,"key8":3,"skey2":"3"}],"array2":[{"key2":2,"inner2":[{"key9":1,"key10":2,"struct3":{"k1":1,"k2":1}},{"key9":1,"key10":2,"struct3":{"k1":1,"k2":1}}]},{"key2":3,"inner2":[{"key9":3,"key10":1,"struct3":{"k1":2,"k2":1}},{"key9":4,"key10":1,"struct3":{"k1":3,"k2":3}}]}]} 3 | {"id":3,"key1":3,"key2":2,"struct1":{"key3":1,"key4":2},"struct2":{"inner1":{"key5":1,"key6":2,"skey1":"3"}},"struct3":{"inner3":{"array3":[{"a1":5,"a2":3,"a3":"4"},{"a1":8,"a2":4,"a3":"7"}]}},"array1":[],"array2":[{"key2":3,"inner2":[{"key9":2,"key10":3,"struct3":{"k1":2,"k2":2}},{"key9":2,"key10":1}]},{"key2":2,"inner2":[{"key9":3,"key10":1,"struct3":{"k1":1,"k2":2}},{"key9":2,"key10":1,"struct3":{"k1":1,"k2":1}}]}]} 4 | {"id":4,"key1":2,"key2":3,"struct1":{"key3":2,"key4":1},"struct2":{"inner1":{"key5":3,"key6":2,"skey1":"2"}},"struct3":{"inner3":{"array3":[{"a1":6,"a2":4,"a3":"6"},{"a1":9,"a2":3,"a3":"7"}]}},"array1":[],"array2":[{"key2":4,"inner2":[]},{"key2":1,"inner2":[{"key9":2,"key10":2,"struct3":{"k1":1,"k2":1}}]}]} 5 | {"id":5,"key1":4,"key2":1,"struct1":{"key3":3,"key4":3},"struct2":{"inner1":{"key5":2,"key6":1,"skey1":"3"}},"struct3":{"inner3":{"array3":[{"a1":7,"a2":5,"a3":"7"}]}},"array1":[],"array2":[]} 6 | {"id":6,"key1":1,"key2":3,"struct1":{"key3":1,"key4":2},"struct2":{"inner1":{"key5":1,"key6":2,"skey1":"4"}},"struct3":{"inner3":{"array3":[{"a1":4,"a2":6,"a3":"5"}]}},"array1":[],"array2":[]} 7 | 
{"id":7,"key1":1,"key2":3,"struct1":{"key3":1,"key4":2},"struct2":{"inner1":{"key5":1}},"array1":[],"array2":[]} 8 | {"id":8,"key1":1,"struct1":{"key3":1}} -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/spark/hats/SparkTestBase.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.spark.hats 18 | 19 | import org.apache.log4j.{Level, Logger} 20 | import org.apache.spark.sql.SparkSession 21 | 22 | trait SparkTestBase { 23 | System.setProperty("user.timezone", "UTC") 24 | 25 | // Do not display INFO entries for tests 26 | Logger.getLogger("org").setLevel(Level.WARN) 27 | Logger.getLogger("akka").setLevel(Level.WARN) 28 | 29 | implicit val spark: SparkSession = SparkSession 30 | .builder() 31 | .master("local[2]") 32 | .appName("test") 33 | .config("spark.ui.enabled", "false") 34 | .config("spark.driver.bindAddress","127.0.0.1") 35 | .config("spark.driver.host", "localhost") 36 | .getOrCreate() 37 | } 38 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/spark/hats/transformations/DeepArrayErrorTransformationSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.spark.hats.transformations 18 | 19 | import org.apache.spark.sql.DataFrame 20 | import org.apache.spark.sql.functions._ 21 | import org.apache.spark.sql.types.{IntegerType, StringType} 22 | import org.scalatest.funsuite.AnyFunSuite 23 | import org.slf4j.LoggerFactory 24 | import za.co.absa.spark.hats.SparkTestBase 25 | import za.co.absa.spark.hats.transformations.samples.DeepArraySamples._ 26 | import za.co.absa.spark.hats.transformations.samples.SampleErrorUDFs 27 | import za.co.absa.spark.hats.utils.JsonUtils 28 | 29 | class DeepArrayErrorTransformationSuite extends AnyFunSuite with SparkTestBase { 30 | // scalastyle:off line.size.limit 31 | // scalastyle:off null 32 | 33 | import spark.implicits._ 34 | import za.co.absa.spark.hats.Extensions._ 35 | implicit val _: SampleErrorUDFs = new SampleErrorUDFs 36 | 37 | private val log = LoggerFactory.getLogger(this.getClass) 38 | 39 | test("Test casting of a plain field with error column") { 40 | val df = spark.sparkContext.parallelize(plainSampleE).toDF 41 | 42 | val expectedSchema = 43 | """root 44 | | |-- city: string (nullable = true) 45 | | |-- street: string (nullable = true) 46 | | |-- buildingNum: integer (nullable = false) 47 | | |-- zip: string (nullable = true) 48 | | |-- errors: array (nullable = true) 49 | | | |-- element: struct (containsNull = true) 50 | | | | |-- errType: string (nullable = true) 51 | | | | |-- errCode: string (nullable = true) 52 | | | | |-- errMsg: string (nullable = true) 53 | | | | |-- errCol: string (nullable = true) 54 | | | | |-- rawValues: array (nullable = true) 55 | | | | | |-- element: string (containsNull = true) 56 | | | | |-- mappings: array (nullable = true) 57 | | | | | |-- element: struct (containsNull = true) 58 | | | | | | |-- mappingTableColumn: string (nullable = true) 59 | | | | | | |-- mappedDatasetColumn: string (nullable = true) 60 | | |-- intZip: integer (nullable = true) 61 | |""".stripMargin.replace("\r\n", "\n") 62 | val expectedResults = 63 | """[ { 64 | | "city" : "Olomuc", 65 | | "street" : "Vodickova", 66 | | "buildingNum" : 12, 67 | | "zip" : "12000", 68 | | "errors" : [ ], 69 | | "intZip" : 12000 70 | |}, { 71 | | "city" : "Ostrava", 72 | | "street" : "Vlavska", 73 | | "buildingNum" : 110, 74 | | "zip" : "1455a", 75 | | "errors" : [ { 76 | | "errType" : "confCastError", 77 | | "errCode" : "E00003", 78 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 79 | | "errCol" : "intZip", 80 | | "rawValues" : [ "1455a" ], 81 | | "mappings" : [ ] 82 | | } ] 83 | |}, { 84 | | "city" : "Plzen", 85 | | "street" : "Kralova", 86 | | "buildingNum" : 71, 87 | | "zip" : "b881", 88 | | "errors" : [ { 89 | | "errType" : "myErrorType", 90 | | "errCode" : "E-1", 91 | | "errMsg" : "Testing This stuff", 92 | | "errCol" : "whatEvColumn", 93 | | "rawValues" : [ "some value" ], 94 | | "mappings" : [ ] 95 | | }, { 96 | | "errType" : "confCastError", 97 | | "errCode" : "E00003", 98 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 99 | | "errCol" : "intZip", 100 | | "rawValues" : [ "b881" ], 101 | | "mappings" : [ ] 102 | | } ] 103 | |} ]""" 104 | .stripMargin.replace("\r\n", "\n") 105 | 106 | processCastExample(df, "zip", "intZip", expectedSchema, expectedResults) 107 | } 108 | 109 | test("Test casting of a struct of struct field with error column") { 110 | val df = spark.sparkContext.parallelize(structOfStructSampleE).toDF 111 | 112 | val expectedSchema = 113 | """root 114 | | |-- id: integer (nullable = false) 
115 | | |-- employee: struct (nullable = false) 116 | | | |-- name: string (nullable = true) 117 | | | |-- address: struct (nullable = false) 118 | | | | |-- city: string (nullable = true) 119 | | | | |-- street: string (nullable = true) 120 | | | | |-- buildingNum: integer (nullable = true) 121 | | | | |-- zip: string (nullable = true) 122 | | | | |-- intZip: integer (nullable = true) 123 | | |-- errors: array (nullable = true) 124 | | | |-- element: struct (containsNull = true) 125 | | | | |-- errType: string (nullable = true) 126 | | | | |-- errCode: string (nullable = true) 127 | | | | |-- errMsg: string (nullable = true) 128 | | | | |-- errCol: string (nullable = true) 129 | | | | |-- rawValues: array (nullable = true) 130 | | | | | |-- element: string (containsNull = true) 131 | | | | |-- mappings: array (nullable = true) 132 | | | | | |-- element: struct (containsNull = true) 133 | | | | | | |-- mappingTableColumn: string (nullable = true) 134 | | | | | | |-- mappedDatasetColumn: string (nullable = true) 135 | |""".stripMargin.replace("\r\n", "\n") 136 | val expectedResults = 137 | """[ { 138 | | "id" : 1, 139 | | "employee" : { 140 | | "name" : "Martin", 141 | | "address" : { 142 | | "city" : "Olomuc", 143 | | "street" : "Vodickova", 144 | | "buildingNum" : 12, 145 | | "zip" : "12000", 146 | | "intZip" : 12000 147 | | } 148 | | }, 149 | | "errors" : [ ] 150 | |}, { 151 | | "id" : 1, 152 | | "employee" : { 153 | | "name" : "Petr", 154 | | "address" : { 155 | | "city" : "Ostrava", 156 | | "street" : "Vlavska", 157 | | "buildingNum" : 110, 158 | | "zip" : "1455a" 159 | | } 160 | | }, 161 | | "errors" : [ { 162 | | "errType" : "myErrorType", 163 | | "errCode" : "E-1", 164 | | "errMsg" : "Testing This stuff", 165 | | "errCol" : "whatEvColumn", 166 | | "rawValues" : [ "some value" ], 167 | | "mappings" : [ ] 168 | | }, { 169 | | "errType" : "confCastError", 170 | | "errCode" : "E00003", 171 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 172 | | "errCol" : "employee.address.intZip", 173 | | "rawValues" : [ "1455a" ], 174 | | "mappings" : [ ] 175 | | } ] 176 | |}, { 177 | | "id" : 1, 178 | | "employee" : { 179 | | "name" : "Vojta", 180 | | "address" : { 181 | | "city" : "Plzen", 182 | | "street" : "Kralova", 183 | | "buildingNum" : 71, 184 | | "zip" : "b881" 185 | | } 186 | | }, 187 | | "errors" : [ { 188 | | "errType" : "confCastError", 189 | | "errCode" : "E00003", 190 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 191 | | "errCol" : "employee.address.intZip", 192 | | "rawValues" : [ "b881" ], 193 | | "mappings" : [ ] 194 | | } ] 195 | |} ]""" 196 | .stripMargin.replace("\r\n", "\n") 197 | 198 | processCastExample(df, "employee.address.zip", "employee.address.intZip", 199 | expectedSchema, expectedResults) 200 | } 201 | 202 | test("Test casting of an array of struct of struct with error column") { 203 | val df = spark.sparkContext.parallelize(arrayOfStructOfStructErrSampleE).toDF 204 | 205 | val expectedSchema = 206 | """root 207 | | |-- id: integer (nullable = false) 208 | | |-- employee: array (nullable = true) 209 | | | |-- element: struct (containsNull = false) 210 | | | | |-- name: string (nullable = true) 211 | | | | |-- address: struct (nullable = false) 212 | | | | | |-- city: string (nullable = true) 213 | | | | | |-- street: string (nullable = true) 214 | | | | | |-- buildingNum: integer (nullable = true) 215 | | | | | |-- zip: string (nullable = true) 216 | | | | | |-- intZip: integer (nullable = true) 217 | | 
|-- errors: array (nullable = true) 218 | | | |-- element: struct (containsNull = true) 219 | | | | |-- errType: string (nullable = true) 220 | | | | |-- errCode: string (nullable = true) 221 | | | | |-- errMsg: string (nullable = true) 222 | | | | |-- errCol: string (nullable = true) 223 | | | | |-- rawValues: array (nullable = true) 224 | | | | | |-- element: string (containsNull = true) 225 | | | | |-- mappings: array (nullable = true) 226 | | | | | |-- element: struct (containsNull = true) 227 | | | | | | |-- mappingTableColumn: string (nullable = true) 228 | | | | | | |-- mappedDatasetColumn: string (nullable = true) 229 | |""".stripMargin.replace("\r\n", "\n") 230 | val expectedResults = 231 | """[ { 232 | | "id" : 1, 233 | | "employee" : [ { 234 | | "name" : "Martin", 235 | | "address" : { 236 | | "city" : "Olomuc", 237 | | "street" : "Vodickova", 238 | | "buildingNum" : 732, 239 | | "zip" : "73200", 240 | | "intZip" : 73200 241 | | } 242 | | }, { 243 | | "name" : "Stephan", 244 | | "address" : { 245 | | "city" : "Olomuc", 246 | | "street" : "Vodickova", 247 | | "buildingNum" : 77, 248 | | "zip" : "77-333" 249 | | } 250 | | } ], 251 | | "errors" : [ { 252 | | "errType" : "confCastError", 253 | | "errCode" : "E00003", 254 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 255 | | "errCol" : "employee.address.intZip", 256 | | "rawValues" : [ "77-333" ], 257 | | "mappings" : [ ] 258 | | } ] 259 | |}, { 260 | | "id" : 2, 261 | | "employee" : [ { 262 | | "name" : "Petr", 263 | | "address" : { 264 | | "city" : "Ostrava", 265 | | "street" : "Vlavska", 266 | | "buildingNum" : 25, 267 | | "zip" : "a9991" 268 | | } 269 | | }, { 270 | | "name" : "Michal", 271 | | "address" : { 272 | | "city" : "Ostrava", 273 | | "street" : "Vlavska", 274 | | "buildingNum" : 334, 275 | | "zip" : "552-aa1" 276 | | } 277 | | } ], 278 | | "errors" : [ { 279 | | "errType" : "myErrorType", 280 | | "errCode" : "E-1", 281 | | "errMsg" : "Testing This stuff", 282 | | "errCol" : "whatEvColumn", 283 | | "rawValues" : [ "some value" ], 284 | | "mappings" : [ ] 285 | | }, { 286 | | "errType" : "confCastError", 287 | | "errCode" : "E00003", 288 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 289 | | "errCol" : "employee.address.intZip", 290 | | "rawValues" : [ "a9991" ], 291 | | "mappings" : [ ] 292 | | }, { 293 | | "errType" : "confCastError", 294 | | "errCode" : "E00003", 295 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 296 | | "errCol" : "employee.address.intZip", 297 | | "rawValues" : [ "552-aa1" ], 298 | | "mappings" : [ ] 299 | | } ] 300 | |}, { 301 | | "id" : 3, 302 | | "employee" : [ { 303 | | "name" : "Vojta", 304 | | "address" : { 305 | | "city" : "Plzen", 306 | | "street" : "Kralova", 307 | | "buildingNum" : 33, 308 | | "zip" : "993", 309 | | "intZip" : 993 310 | | } 311 | | } ], 312 | | "errors" : [ ] 313 | |} ]""" 314 | .stripMargin.replace("\r\n", "\n") 315 | 316 | processCastExample(df, "employee.address.zip", "employee.address.intZip", 317 | expectedSchema, expectedResults) 318 | } 319 | 320 | test("Test casting of an array of primitives") { 321 | val df = spark.sparkContext.parallelize(arraysOfPrimitivesSampleE).toDF 322 | 323 | val expectedSchema = 324 | """root 325 | | |-- id: integer (nullable = false) 326 | | |-- nums: array (nullable = true) 327 | | | |-- element: string (containsNull = true) 328 | | |-- intNums: array (nullable = true) 329 | | | |-- element: integer (containsNull = true) 330 | | |-- errors: 
array (nullable = true) 331 | | | |-- element: struct (containsNull = true) 332 | | | | |-- errType: string (nullable = true) 333 | | | | |-- errCode: string (nullable = true) 334 | | | | |-- errMsg: string (nullable = true) 335 | | | | |-- errCol: string (nullable = true) 336 | | | | |-- rawValues: array (nullable = true) 337 | | | | | |-- element: string (containsNull = true) 338 | | | | |-- mappings: array (nullable = true) 339 | | | | | |-- element: struct (containsNull = true) 340 | | | | | | |-- mappingTableColumn: string (nullable = true) 341 | | | | | | |-- mappedDatasetColumn: string (nullable = true) 342 | |""".stripMargin.replace("\r\n", "\n") 343 | val expectedResults = 344 | """[ { 345 | | "id" : 1, 346 | | "nums" : [ "7755", "a212", "222-111" ], 347 | | "intNums" : [ 7755, null, null ], 348 | | "errors" : [ { 349 | | "errType" : "myErrorType", 350 | | "errCode" : "E-1", 351 | | "errMsg" : "Testing This stuff", 352 | | "errCol" : "whatEvColumn", 353 | | "rawValues" : [ "some value" ], 354 | | "mappings" : [ ] 355 | | }, { 356 | | "errType" : "confCastError", 357 | | "errCode" : "E00003", 358 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 359 | | "errCol" : "intNums", 360 | | "rawValues" : [ "a212" ], 361 | | "mappings" : [ ] 362 | | }, { 363 | | "errType" : "confCastError", 364 | | "errCode" : "E00003", 365 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 366 | | "errCol" : "intNums", 367 | | "rawValues" : [ "222-111" ], 368 | | "mappings" : [ ] 369 | | } ] 370 | |}, { 371 | | "id" : 1, 372 | | "nums" : [ "223a", "223a", "775" ], 373 | | "intNums" : [ null, null, 775 ], 374 | | "errors" : [ { 375 | | "errType" : "confCastError", 376 | | "errCode" : "E00003", 377 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 378 | | "errCol" : "intNums", 379 | | "rawValues" : [ "223a" ], 380 | | "mappings" : [ ] 381 | | } ] 382 | |}, { 383 | | "id" : 1, 384 | | "nums" : [ "5", "-100", "9999999" ], 385 | | "intNums" : [ 5, -100, 9999999 ], 386 | | "errors" : [ ] 387 | |} ]""" 388 | .stripMargin.replace("\r\n", "\n") 389 | 390 | processCastExample(df, "nums", "intNums", expectedSchema, expectedResults) 391 | } 392 | 393 | test("Test casting of an array of array of primitives") { 394 | val df = spark.sparkContext.parallelize(arraysOfArraysOfPrimitivesSampleE).toDF 395 | 396 | val expectedSchema = 397 | """root 398 | | |-- id: integer (nullable = false) 399 | | |-- matrix: array (nullable = true) 400 | | | |-- element: array (containsNull = true) 401 | | | | |-- element: string (containsNull = true) 402 | | |-- intMatrix: array (nullable = true) 403 | | | |-- element: array (containsNull = true) 404 | | | | |-- element: integer (containsNull = true) 405 | | |-- errors: array (nullable = true) 406 | | | |-- element: struct (containsNull = true) 407 | | | | |-- errType: string (nullable = true) 408 | | | | |-- errCode: string (nullable = true) 409 | | | | |-- errMsg: string (nullable = true) 410 | | | | |-- errCol: string (nullable = true) 411 | | | | |-- rawValues: array (nullable = true) 412 | | | | | |-- element: string (containsNull = true) 413 | | | | |-- mappings: array (nullable = true) 414 | | | | | |-- element: struct (containsNull = true) 415 | | | | | | |-- mappingTableColumn: string (nullable = true) 416 | | | | | | |-- mappedDatasetColumn: string (nullable = true) 417 | |""".stripMargin.replace("\r\n", "\n") 418 | val expectedResults = 419 | """[ { 420 | | "id" : 1, 421 | | "matrix" : [ [ 
"10", "11b" ], [ "11b", "12" ] ], 422 | | "intMatrix" : [ [ 10, null ], [ null, 12 ] ], 423 | | "errors" : [ { 424 | | "errType" : "myErrorType", 425 | | "errCode" : "E-1", 426 | | "errMsg" : "Testing This stuff", 427 | | "errCol" : "whatEvColumn", 428 | | "rawValues" : [ "some value" ], 429 | | "mappings" : [ ] 430 | | }, { 431 | | "errType" : "confCastError", 432 | | "errCode" : "E00003", 433 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 434 | | "errCol" : "intMatrix", 435 | | "rawValues" : [ "11b" ], 436 | | "mappings" : [ ] 437 | | } ] 438 | |}, { 439 | | "id" : 2, 440 | | "matrix" : [ [ "20f", "300" ], [ "1000", "10-10" ] ], 441 | | "intMatrix" : [ [ null, 300 ], [ 1000, null ] ], 442 | | "errors" : [ { 443 | | "errType" : "confCastError", 444 | | "errCode" : "E00003", 445 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 446 | | "errCol" : "intMatrix", 447 | | "rawValues" : [ "20f" ], 448 | | "mappings" : [ ] 449 | | }, { 450 | | "errType" : "confCastError", 451 | | "errCode" : "E00003", 452 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 453 | | "errCol" : "intMatrix", 454 | | "rawValues" : [ "10-10" ], 455 | | "mappings" : [ ] 456 | | } ] 457 | |}, { 458 | | "id" : 3, 459 | | "matrix" : [ [ "775", "223" ], [ "100", "0" ] ], 460 | | "intMatrix" : [ [ 775, 223 ], [ 100, 0 ] ], 461 | | "errors" : [ ] 462 | |} ]""" 463 | .stripMargin.replace("\r\n", "\n") 464 | 465 | processCastExample(df, "matrix", "intMatrix", expectedSchema, expectedResults) 466 | } 467 | 468 | test("Test casting of an array of struct of array of struct with error column") { 469 | val df = spark.sparkContext.parallelize(arraysOfStrtuctsDeepSampleE).toDF 470 | 471 | val expectedSchema = 472 | """root 473 | | |-- id: integer (nullable = false) 474 | | |-- legs: array (nullable = true) 475 | | | |-- element: struct (containsNull = false) 476 | | | | |-- legid: integer (nullable = true) 477 | | | | |-- conditions: array (nullable = true) 478 | | | | | |-- element: struct (containsNull = false) 479 | | | | | | |-- conif: string (nullable = true) 480 | | | | | | |-- conthen: string (nullable = true) 481 | | | | | | |-- amount: double (nullable = true) 482 | | | | | | |-- intConditionVal: integer (nullable = true) 483 | | |-- errors: array (nullable = true) 484 | | | |-- element: struct (containsNull = true) 485 | | | | |-- errType: string (nullable = true) 486 | | | | |-- errCode: string (nullable = true) 487 | | | | |-- errMsg: string (nullable = true) 488 | | | | |-- errCol: string (nullable = true) 489 | | | | |-- rawValues: array (nullable = true) 490 | | | | | |-- element: string (containsNull = true) 491 | | | | |-- mappings: array (nullable = true) 492 | | | | | |-- element: struct (containsNull = true) 493 | | | | | | |-- mappingTableColumn: string (nullable = true) 494 | | | | | | |-- mappedDatasetColumn: string (nullable = true) 495 | |""".stripMargin.replace("\r\n", "\n") 496 | val expectedResults = 497 | """[ { 498 | | "id" : 1, 499 | | "legs" : [ { 500 | | "legid" : 100, 501 | | "conditions" : [ { 502 | | "conif" : "if bid>10", 503 | | "conthen" : "100", 504 | | "amount" : 100.0, 505 | | "intConditionVal" : 100 506 | | }, { 507 | | "conif" : "if sell<5", 508 | | "conthen" : "300a", 509 | | "amount" : 150.0 510 | | }, { 511 | | "conif" : "if sell<1", 512 | | "conthen" : "1000", 513 | | "amount" : 1000.0, 514 | | "intConditionVal" : 1000 515 | | } ] 516 | | }, { 517 | | "legid" : 101, 518 | | "conditions" : [ { 519 | 
| "conif" : "if bid<50", 520 | | "conthen" : "200", 521 | | "amount" : 200.0, 522 | | "intConditionVal" : 200 523 | | }, { 524 | | "conif" : "if sell>30", 525 | | "conthen" : "175b", 526 | | "amount" : 175.0 527 | | }, { 528 | | "conif" : "if sell>25", 529 | | "conthen" : "225-225", 530 | | "amount" : 225.0 531 | | } ] 532 | | } ], 533 | | "errors" : [ { 534 | | "errType" : "confCastError", 535 | | "errCode" : "E00003", 536 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 537 | | "errCol" : "legs.conditions.intConditionVal", 538 | | "rawValues" : [ "300a" ], 539 | | "mappings" : [ ] 540 | | }, { 541 | | "errType" : "confCastError", 542 | | "errCode" : "E00003", 543 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 544 | | "errCol" : "legs.conditions.intConditionVal", 545 | | "rawValues" : [ "175b" ], 546 | | "mappings" : [ ] 547 | | }, { 548 | | "errType" : "confCastError", 549 | | "errCode" : "E00003", 550 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 551 | | "errCol" : "legs.conditions.intConditionVal", 552 | | "rawValues" : [ "225-225" ], 553 | | "mappings" : [ ] 554 | | } ] 555 | |}, { 556 | | "id" : 2, 557 | | "legs" : [ { 558 | | "legid" : 102, 559 | | "conditions" : [ { 560 | | "conif" : "if bid>11", 561 | | "conthen" : "100", 562 | | "amount" : 100.0, 563 | | "intConditionVal" : 100 564 | | }, { 565 | | "conif" : "if sell<6", 566 | | "conthen" : "150", 567 | | "amount" : 150.0, 568 | | "intConditionVal" : 150 569 | | }, { 570 | | "conif" : "if sell<2", 571 | | "conthen" : "1000", 572 | | "amount" : 1000.0, 573 | | "intConditionVal" : 1000 574 | | } ] 575 | | }, { 576 | | "legid" : 103, 577 | | "conditions" : [ { 578 | | "conif" : "if bid<51", 579 | | "conthen" : "200", 580 | | "amount" : 200.0, 581 | | "intConditionVal" : 200 582 | | }, { 583 | | "conif" : "if sell>31", 584 | | "conthen" : "175", 585 | | "amount" : 175.0, 586 | | "intConditionVal" : 175 587 | | }, { 588 | | "conif" : "if sell>26", 589 | | "conthen" : "225", 590 | | "amount" : 225.0, 591 | | "intConditionVal" : 225 592 | | } ] 593 | | } ], 594 | | "errors" : [ ] 595 | |}, { 596 | | "id" : 3, 597 | | "legs" : [ { 598 | | "legid" : 104, 599 | | "conditions" : [ { 600 | | "conif" : "if bid>12", 601 | | "conthen" : "1OO", 602 | | "amount" : 100.0 603 | | }, { 604 | | "conif" : "if sell<7", 605 | | "conthen" : "150x", 606 | | "amount" : 150.0 607 | | }, { 608 | | "conif" : "if sell<3", 609 | | "conthen" : "-1000-", 610 | | "amount" : 1000.0 611 | | } ] 612 | | }, { 613 | | "legid" : 105, 614 | | "conditions" : [ { 615 | | "conif" : "if bid<52", 616 | | "conthen" : "2OO", 617 | | "amount" : 200.0 618 | | }, { 619 | | "conif" : "if sell>32", 620 | | "conthen" : "f175", 621 | | "amount" : 175.0 622 | | }, { 623 | | "conif" : "if sell>27", 624 | | "conthen" : "225_", 625 | | "amount" : 225.0 626 | | } ] 627 | | } ], 628 | | "errors" : [ { 629 | | "errType" : "myErrorType", 630 | | "errCode" : "E-1", 631 | | "errMsg" : "Testing This stuff", 632 | | "errCol" : "whatEvColumn", 633 | | "rawValues" : [ "some value" ], 634 | | "mappings" : [ ] 635 | | }, { 636 | | "errType" : "confCastError", 637 | | "errCode" : "E00003", 638 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 639 | | "errCol" : "legs.conditions.intConditionVal", 640 | | "rawValues" : [ "1OO" ], 641 | | "mappings" : [ ] 642 | | }, { 643 | | "errType" : "confCastError", 644 | | "errCode" : "E00003", 645 | | "errMsg" : "Conformance Error - Null 
returned by casting conformance rule", 646 | | "errCol" : "legs.conditions.intConditionVal", 647 | | "rawValues" : [ "150x" ], 648 | | "mappings" : [ ] 649 | | }, { 650 | | "errType" : "confCastError", 651 | | "errCode" : "E00003", 652 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 653 | | "errCol" : "legs.conditions.intConditionVal", 654 | | "rawValues" : [ "-1000-" ], 655 | | "mappings" : [ ] 656 | | }, { 657 | | "errType" : "confCastError", 658 | | "errCode" : "E00003", 659 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 660 | | "errCol" : "legs.conditions.intConditionVal", 661 | | "rawValues" : [ "2OO" ], 662 | | "mappings" : [ ] 663 | | }, { 664 | | "errType" : "confCastError", 665 | | "errCode" : "E00003", 666 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 667 | | "errCol" : "legs.conditions.intConditionVal", 668 | | "rawValues" : [ "f175" ], 669 | | "mappings" : [ ] 670 | | }, { 671 | | "errType" : "confCastError", 672 | | "errCode" : "E00003", 673 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 674 | | "errCol" : "legs.conditions.intConditionVal", 675 | | "rawValues" : [ "225_" ], 676 | | "mappings" : [ ] 677 | | } ] 678 | |} ]""" 679 | .stripMargin.replace("\r\n", "\n") 680 | 681 | processCastExample(df, "legs.conditions.conthen", "legs.conditions.intConditionVal", 682 | expectedSchema, expectedResults) 683 | } 684 | 685 | test("Test casting of an array of struct of struct WITHOUT error column") { 686 | val df = spark.sparkContext.parallelize(arrayOfStructOfStruvtNoErrSampleE).toDF 687 | 688 | val expectedSchema = 689 | """root 690 | | |-- id: integer (nullable = false) 691 | | |-- employee: array (nullable = true) 692 | | | |-- element: struct (containsNull = false) 693 | | | | |-- name: string (nullable = true) 694 | | | | |-- address: struct (nullable = false) 695 | | | | | |-- city: string (nullable = true) 696 | | | | | |-- street: string (nullable = true) 697 | | | | | |-- buildingNum: integer (nullable = true) 698 | | | | | |-- zip: string (nullable = true) 699 | | | | | |-- intZip: integer (nullable = true) 700 | | |-- errors: array (nullable = true) 701 | | | |-- element: struct (containsNull = true) 702 | | | | |-- errType: string (nullable = true) 703 | | | | |-- errCode: string (nullable = true) 704 | | | | |-- errMsg: string (nullable = true) 705 | | | | |-- errCol: string (nullable = true) 706 | | | | |-- rawValues: array (nullable = true) 707 | | | | | |-- element: string (containsNull = true) 708 | | | | |-- mappings: array (nullable = true) 709 | | | | | |-- element: struct (containsNull = true) 710 | | | | | | |-- mappingTableColumn: string (nullable = true) 711 | | | | | | |-- mappedDatasetColumn: string (nullable = true) 712 | |""".stripMargin.replace("\r\n", "\n") 713 | val expectedResults = 714 | """[ { 715 | | "id" : 1, 716 | | "employee" : [ { 717 | | "name" : "Martin", 718 | | "address" : { 719 | | "city" : "Olomuc", 720 | | "street" : "Vodickova", 721 | | "buildingNum" : 732, 722 | | "zip" : "73200", 723 | | "intZip" : 73200 724 | | } 725 | | }, { 726 | | "name" : "Stephan", 727 | | "address" : { 728 | | "city" : "Olomuc", 729 | | "street" : "Vodickova", 730 | | "buildingNum" : 77, 731 | | "zip" : "77-333" 732 | | } 733 | | } ], 734 | | "errors" : [ { 735 | | "errType" : "confCastError", 736 | | "errCode" : "E00003", 737 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 738 | | "errCol" : 
"employee.address.intZip", 739 | | "rawValues" : [ "77-333" ], 740 | | "mappings" : [ ] 741 | | } ] 742 | |}, { 743 | | "id" : 2, 744 | | "employee" : [ { 745 | | "name" : "Petr", 746 | | "address" : { 747 | | "city" : "Ostrava", 748 | | "street" : "Vlavska", 749 | | "buildingNum" : 25, 750 | | "zip" : "a9991" 751 | | } 752 | | }, { 753 | | "name" : "Michal", 754 | | "address" : { 755 | | "city" : "Ostrava", 756 | | "street" : "Vlavska", 757 | | "buildingNum" : 334, 758 | | "zip" : "552-aa1" 759 | | } 760 | | } ], 761 | | "errors" : [ { 762 | | "errType" : "confCastError", 763 | | "errCode" : "E00003", 764 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 765 | | "errCol" : "employee.address.intZip", 766 | | "rawValues" : [ "a9991" ], 767 | | "mappings" : [ ] 768 | | }, { 769 | | "errType" : "confCastError", 770 | | "errCode" : "E00003", 771 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 772 | | "errCol" : "employee.address.intZip", 773 | | "rawValues" : [ "552-aa1" ], 774 | | "mappings" : [ ] 775 | | } ] 776 | |}, { 777 | | "id" : 3, 778 | | "employee" : [ { 779 | | "name" : "Vojta", 780 | | "address" : { 781 | | "city" : "Plzen", 782 | | "street" : "Kralova", 783 | | "buildingNum" : 33, 784 | | "zip" : "993", 785 | | "intZip" : 993 786 | | } 787 | | } ], 788 | | "errors" : [ ] 789 | |} ]""" 790 | .stripMargin.replace("\r\n", "\n") 791 | 792 | processCastExample(df, "employee.address.zip", "employee.address.intZip", 793 | expectedSchema, expectedResults) 794 | } 795 | 796 | test ("Test multiple levels of nesting") { 797 | 798 | val sample = """[{"id":1,"legs":[{"legid":100,"conditions":[{"checks":[{"checkNums":["1","2","3b","4","5c","6"]}],"amount":100}]}]}]""" 799 | 800 | val df = JsonUtils.getDataFrameFromJson(spark, Seq(sample)) 801 | 802 | val expectedSchema = 803 | """root 804 | | |-- id: long (nullable = true) 805 | | |-- legs: array (nullable = true) 806 | | | |-- element: struct (containsNull = false) 807 | | | | |-- conditions: array (nullable = true) 808 | | | | | |-- element: struct (containsNull = false) 809 | | | | | | |-- amount: long (nullable = true) 810 | | | | | | |-- checks: array (nullable = true) 811 | | | | | | | |-- element: struct (containsNull = false) 812 | | | | | | | | |-- checkNums: array (nullable = true) 813 | | | | | | | | | |-- element: string (containsNull = true) 814 | | | | | | | | |-- optimizedNums: array (nullable = true) 815 | | | | | | | | | |-- element: integer (containsNull = true) 816 | | | | |-- legid: long (nullable = true) 817 | | |-- errors: array (nullable = true) 818 | | | |-- element: struct (containsNull = true) 819 | | | | |-- errType: string (nullable = true) 820 | | | | |-- errCode: string (nullable = true) 821 | | | | |-- errMsg: string (nullable = true) 822 | | | | |-- errCol: string (nullable = true) 823 | | | | |-- rawValues: array (nullable = true) 824 | | | | | |-- element: string (containsNull = true) 825 | | | | |-- mappings: array (nullable = true) 826 | | | | | |-- element: struct (containsNull = true) 827 | | | | | | |-- mappingTableColumn: string (nullable = true) 828 | | | | | | |-- mappedDatasetColumn: string (nullable = true) 829 | |""".stripMargin.replace("\r\n", "\n") 830 | val expectedResults = 831 | """[ { 832 | | "id" : 1, 833 | | "legs" : [ { 834 | | "conditions" : [ { 835 | | "amount" : 100, 836 | | "checks" : [ { 837 | | "checkNums" : [ "1", "2", "3b", "4", "5c", "6" ], 838 | | "optimizedNums" : [ 1, 2, null, 4, null, 6 ] 839 | | } ] 840 | | } ], 841 
| | "legid" : 100 842 | | } ], 843 | | "errors" : [ { 844 | | "errType" : "confCastError", 845 | | "errCode" : "E00003", 846 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 847 | | "errCol" : "legs.conditions.checks.optimizedNums", 848 | | "rawValues" : [ "3b" ], 849 | | "mappings" : [ ] 850 | | }, { 851 | | "errType" : "confCastError", 852 | | "errCode" : "E00003", 853 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 854 | | "errCol" : "legs.conditions.checks.optimizedNums", 855 | | "rawValues" : [ "5c" ], 856 | | "mappings" : [ ] 857 | | } ] 858 | |} ]""" 859 | .stripMargin.replace("\r\n", "\n") 860 | 861 | processCastExample(df, "legs.conditions.checks.checkNums", "legs.conditions.checks.optimizedNums", 862 | expectedSchema, expectedResults) 863 | } 864 | 865 | test ("Test combining fields on multiple levels of nesting") { 866 | 867 | val sample = """[{"id":1,"legs":[{"legid":100,"conditions":[{"checks":[{"checkNums":["1","2","3b","4","5c","6"]}],"amount":100}]}]}]""" 868 | 869 | val df = JsonUtils.getDataFrameFromJson(spark, Seq(sample)) 870 | 871 | val expectedSchema = 872 | """root 873 | | |-- id: long (nullable = true) 874 | | |-- legs: array (nullable = true) 875 | | | |-- element: struct (containsNull = false) 876 | | | | |-- conditions: array (nullable = true) 877 | | | | | |-- element: struct (containsNull = false) 878 | | | | | | |-- amount: long (nullable = true) 879 | | | | | | |-- checks: array (nullable = true) 880 | | | | | | | |-- element: struct (containsNull = false) 881 | | | | | | | | |-- checkNums: array (nullable = true) 882 | | | | | | | | | |-- element: string (containsNull = true) 883 | | | | | | | | |-- optimizedNums: array (nullable = true) 884 | | | | | | | | | |-- element: string (containsNull = true) 885 | | | | |-- legid: long (nullable = true) 886 | | |-- errors: array (nullable = true) 887 | | | |-- element: struct (containsNull = true) 888 | | | | |-- errType: string (nullable = true) 889 | | | | |-- errCode: string (nullable = true) 890 | | | | |-- errMsg: string (nullable = true) 891 | | | | |-- errCol: string (nullable = true) 892 | | | | |-- rawValues: array (nullable = true) 893 | | | | | |-- element: string (containsNull = true) 894 | | | | |-- mappings: array (nullable = true) 895 | | | | | |-- element: struct (containsNull = true) 896 | | | | | | |-- mappingTableColumn: string (nullable = true) 897 | | | | | | |-- mappedDatasetColumn: string (nullable = true) 898 | |""".stripMargin.replace("\r\n", "\n") 899 | 900 | val expectedResults = 901 | """[ { 902 | | "id" : 1, 903 | | "legs" : [ { 904 | | "conditions" : [ { 905 | | "amount" : 100, 906 | | "checks" : [ { 907 | | "checkNums" : [ "1", "2", "3b", "4", "5c", "6" ], 908 | | "optimizedNums" : [ "1_100_1", "2_100_1", "3b_100_1", "4_100_1", "5c_100_1", "6_100_1" ] 909 | | } ] 910 | | } ], 911 | | "legid" : 100 912 | | } ], 913 | | "errors" : [ { 914 | | "errType" : "confCastError", 915 | | "errCode" : "E00003", 916 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 917 | | "errCol" : "legs.conditions.checks.optimizedNums", 918 | | "rawValues" : [ "3b" ], 919 | | "mappings" : [ ] 920 | | }, { 921 | | "errType" : "confCastError", 922 | | "errCode" : "E00003", 923 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule", 924 | | "errCol" : "legs.conditions.checks.optimizedNums", 925 | | "rawValues" : [ "5c" ], 926 | | "mappings" : [ ] 927 | | } ] 928 | |} ]""" 929 | 
.stripMargin.replace("\r\n", "\n") 930 | 931 | val inputColumn = "legs.conditions.checks.checkNums" 932 | val outputColumn = "legs.conditions.checks.optimizedNums" 933 | val dfOut = NestedArrayTransformations.nestedExtendedWithColumnAndErrorMap(df, inputColumn, outputColumn, "errors", 934 | (_, gf) => { 935 | concat(gf(inputColumn), 936 | lit("_"), 937 | gf("legs.conditions.amount").cast(StringType), 938 | lit("_"), 939 | gf("id")) 940 | }, (c, gf) => { 941 | when(c.isNotNull.and(c.cast(IntegerType).isNull) 942 | .and(gf("legs.conditions.amount") === 100) 943 | .and(gf("legs.legid") === 100) 944 | .and(gf("id") === 1), 945 | callUDF("confCastErr", lit(outputColumn), gf(inputColumn).cast(StringType))) 946 | .otherwise(null) 947 | }) 948 | 949 | val actualSchema = dfOut.schema.treeString 950 | val actualResults = JsonUtils.prettySparkJSON(dfOut.toJSON.collect) 951 | 952 | assertSchema(actualSchema, expectedSchema) 953 | assertResults(actualResults, expectedResults) 954 | } 955 | 956 | test ("Test deep array transformations unhappy paths") { 957 | val df = spark.sparkContext.parallelize(Seq(1,2,3,4,5)).toDF() 958 | 959 | assert(intercept[IllegalArgumentException] { 960 | NestedArrayTransformations.nestedWithColumnAndErrorMap(df, "value", "value2", "err.errors", c => c, e => e) 961 | }.getMessage contains "Error columns should be at the root schema level") 962 | 963 | assert(intercept[IllegalArgumentException] { 964 | NestedArrayTransformations.nestedWithColumnMap(df, "value.foo", "value.foo2", c => c) 965 | }.getMessage contains "Field 'value' is not a struct type or an array") 966 | 967 | assert(intercept[IllegalArgumentException] { 968 | NestedArrayTransformations.nestedWithColumnMap(df, "value", "", _ => lit("foo")).printSchema() 969 | }.getMessage contains "Output field cannot be empty") 970 | 971 | assert(intercept[IllegalArgumentException] { 972 | df.nestedWithColumn("value", lit("foo")).printSchema() 973 | }.getMessage contains "The column 'value' already exists") 974 | 975 | } 976 | 977 | test("Test array_distinct() from Spark API (didn't work in 2.4.0, fixed in 2.4.1)"){ 978 | val sourceData = 979 | """{ 980 | | "id": 3, 981 | | "MyLiteral": "abcdef", 982 | | "MyUpperLiteral": "ABCDEF", 983 | | "errCol": [ 984 | | { 985 | | "errType": "confMapError", 986 | | "errCode": "E00001", 987 | | "errMsg": "Conformance Error - Null produced by mapping conformance rule", 988 | | "errCol": "legs.conditions.conformed_country", 989 | | "rawValues": [ 990 | | "SWE" 991 | | ], 992 | | "mappings": [ 993 | | { 994 | | "mappingTableColumn": "country_code", 995 | | "mappedDatasetColumn": "legs.conditions.country" 996 | | } 997 | | ] 998 | | }, 999 | | { 1000 | | "errType": "confMapError", 1001 | | "errCode": "E00001", 1002 | | "errMsg": "Conformance Error - Null produced by mapping conformance rule", 1003 | | "errCol": "legs.conditions.conformed_country", 1004 | | "rawValues": [ 1005 | | "SWE" 1006 | | ], 1007 | | "mappings": [ 1008 | | { 1009 | | "mappingTableColumn": "country_code", 1010 | | "mappedDatasetColumn": "legs.conditions.country" 1011 | | } 1012 | | ] 1013 | | }, 1014 | | { 1015 | | "errType": "confMapError", 1016 | | "errCode": "E00001", 1017 | | "errMsg": "Conformance Error - Null produced by mapping conformance rule", 1018 | | "errCol": "legs.conditions.conformed_currency", 1019 | | "rawValues": [ 1020 | | "Dummy" 1021 | | ], 1022 | | "mappings": [ 1023 | | { 1024 | | "mappingTableColumn": "currency_code", 1025 | | "mappedDatasetColumn": "legs.conditions.currency" 1026 | | } 1027 | | ] 
1028 | | } 1029 | | ], 1030 | | "legs": [ 1031 | | { 1032 | | "conditions": [ 1033 | | { 1034 | | "checks": [], 1035 | | "country": "SWE", 1036 | | "currency": "SWK", 1037 | | "product": "Stock", 1038 | | "conformed_currency": "SEK", 1039 | | "conformed_product": "STK" 1040 | | } 1041 | | ], 1042 | | "legid": 300 1043 | | }, 1044 | | { 1045 | | "conditions": [ 1046 | | { 1047 | | "checks": [], 1048 | | "country": "SA", 1049 | | "currency": "Dummy", 1050 | | "product": "Bond", 1051 | | "conformed_country": "South Africa", 1052 | | "conformed_currency": "Unknown", 1053 | | "conformed_product": "BND" 1054 | | } 1055 | | ], 1056 | | "legid": 301 1057 | | } 1058 | | ] 1059 | |}""".stripMargin 1060 | 1061 | val expectedDistinct = 1062 | """{ 1063 | | "MyLiteral" : "abcdef", 1064 | | "errCol" : [ { 1065 | | "errCode" : "E00001", 1066 | | "errCol" : "legs.conditions.conformed_country", 1067 | | "errMsg" : "Conformance Error - Null produced by mapping conformance rule", 1068 | | "errType" : "confMapError", 1069 | | "mappings" : [ { 1070 | | "mappedDatasetColumn" : "legs.conditions.country", 1071 | | "mappingTableColumn" : "country_code" 1072 | | } ], 1073 | | "rawValues" : [ "SWE" ] 1074 | | }, { 1075 | | "errCode" : "E00001", 1076 | | "errCol" : "legs.conditions.conformed_currency", 1077 | | "errMsg" : "Conformance Error - Null produced by mapping conformance rule", 1078 | | "errType" : "confMapError", 1079 | | "mappings" : [ { 1080 | | "mappedDatasetColumn" : "legs.conditions.currency", 1081 | | "mappingTableColumn" : "currency_code" 1082 | | } ], 1083 | | "rawValues" : [ "Dummy" ] 1084 | | } ] 1085 | |}""".stripMargin.replace("\r\n", "\n") 1086 | 1087 | val df = JsonUtils.getDataFrameFromJson(spark, Seq(sourceData)) 1088 | 1089 | val dfDistinct = df.select(col("MyLiteral"), array_distinct(col("errCol")).as("errCol")) 1090 | 1091 | val actualDistinct = JsonUtils.prettyJSON(dfDistinct.toJSON.take(1)(0)) 1092 | 1093 | assert(actualDistinct == expectedDistinct) 1094 | } 1095 | 1096 | private def processCastExample(df: DataFrame, inputColumn: String, outputColumn: String, expectedSchema: String, 1097 | expectedResults: String): Unit = { 1098 | val dfOut = NestedArrayTransformations.nestedWithColumnAndErrorMap(df, inputColumn, outputColumn, "errors", 1099 | c => { 1100 | c.cast(IntegerType) 1101 | }, c => { 1102 | when(c.isNotNull.and(c.cast(IntegerType).isNull), 1103 | callUDF("confCastErr", lit(outputColumn), c.cast(StringType))) 1104 | .otherwise(null) 1105 | }) 1106 | 1107 | val actualSchema = dfOut.schema.treeString 1108 | val actualResults = JsonUtils.prettySparkJSON(dfOut.toJSON.collect) 1109 | 1110 | assertSchema(actualSchema, expectedSchema) 1111 | assertResults(actualResults, expectedResults) 1112 | } 1113 | 1114 | private def assertSchema(actualSchema: String, expectedSchema: String): Unit = { 1115 | if (actualSchema != expectedSchema) { 1116 | log.error("EXPECTED:") 1117 | log.error(expectedSchema) 1118 | log.error("ACTUAL:") 1119 | log.error(actualSchema) 1120 | fail("Actual conformed schema does not match the expected schema (see above).") 1121 | } 1122 | } 1123 | 1124 | private def assertResults(actualResults: String, expectedResults: String): Unit = { 1125 | if (actualResults != expectedResults) { 1126 | log.error("EXPECTED:") 1127 | log.error(expectedResults) 1128 | log.error("ACTUAL:") 1129 | log.error(actualResults) 1130 | fail("Actual conformed dataset JSON does not match the expected JSON (see above).") 1131 | } 1132 | } 1133 | } 1134 | 1135 | 
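For readers skimming the suite above: each casting test reduces to a single call of NestedArrayTransformations.nestedWithColumnAndErrorMap, wrapped here by processCastExample. A minimal standalone sketch follows (the DataFrame df and the field paths are illustrative, and the confCastErr UDF is assumed to be registered via SampleErrorUDFs, as in the suite):

import org.apache.spark.sql.functions.{callUDF, lit, when}
import org.apache.spark.sql.types.{IntegerType, StringType}
import za.co.absa.spark.hats.transformations.NestedArrayTransformations

// Cast a (possibly deeply nested) string field to integer, placing the result
// next to the input field, and append a confCastError entry to the root-level
// "errors" array whenever a non-null value fails the cast.
val dfOut = NestedArrayTransformations.nestedWithColumnAndErrorMap(
  df,                        // illustrative input DataFrame containing the field below
  "employee.address.zip",    // input field: a dot-separated path through structs/arrays
  "employee.address.intZip", // output field to be created
  "errors",                  // root-level error column
  c => c.cast(IntegerType),  // the transformation itself
  c => when(c.isNotNull.and(c.cast(IntegerType).isNull),
    callUDF("confCastErr", lit("employee.address.intZip"), c.cast(StringType)))
    .otherwise(null)         // null means "no error" for this value
)

Returning null from the error lambda for values that cast cleanly is what keeps the errors array empty for conforming rows.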
-------------------------------------------------------------------------------- /src/test/scala/za/co/absa/spark/hats/transformations/ExtendedTransformationsSuite.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.spark.hats.transformations 18 | 19 | import org.apache.commons.io.IOUtils 20 | import org.apache.spark.sql.functions._ 21 | import org.apache.spark.sql.types.StringType 22 | import org.scalatest.funsuite.AnyFunSuite 23 | import org.slf4j.LoggerFactory 24 | import za.co.absa.spark.hats.SparkTestBase 25 | import za.co.absa.spark.hats.transformations.samples.{ErrorMessage, NestedTestCaseFactory, SampleErrorUDFs} 26 | import za.co.absa.spark.hats.utils.JsonUtils 27 | 28 | class ExtendedTransformationsSuite extends AnyFunSuite with SparkTestBase { 29 | implicit val _: SampleErrorUDFs = new SampleErrorUDFs 30 | 31 | private val log = LoggerFactory.getLogger(this.getClass) 32 | private val nestedTestCaseFactory = new NestedTestCaseFactory() 33 | 34 | test("Test extended array transformations work on root level fields") { 35 | val expectedSchema = getResourceString("/test_data/nested/nested1Schema.txt") 36 | val expectedResults = getResourceString("/test_data/nested/nested1Results.json") 37 | 38 | val df = nestedTestCaseFactory.getTestCase 39 | 40 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "", "id_str", (_, gf) => 41 | concat(gf("id"), lit(" "), gf("key1").cast(StringType), lit(" "), gf("key2")) 42 | ) 43 | 44 | val actualSchema = dfOut.schema.treeString 45 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect()) 46 | 47 | assertSchema(actualSchema, expectedSchema) 48 | assertResults(actualResults, expectedResults) 49 | } 50 | 51 | test("Test extended array transformations work on an inner struct level fields") { 52 | val expectedSchema = getResourceString("/test_data/nested/nested2Schema.txt") 53 | val expectedResults = getResourceString("/test_data/nested/nested2Results.json") 54 | 55 | val df = nestedTestCaseFactory.getTestCase 56 | 57 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "struct2", "skey2", (c, gf) => 58 | concat(gf("key1"), lit(" "), gf("struct2.inner1.key5").cast(StringType), lit(" "), 59 | c.getField("inner1").getField("key6")) 60 | ).select("key1", "struct2") 61 | 62 | val actualSchema = dfOut.schema.treeString 63 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect()) 64 | 65 | assertSchema(actualSchema, expectedSchema) 66 | assertResults(actualResults, expectedResults) 67 | } 68 | 69 | test("Test extended array transformations work on a double nested inner struct level fields") { 70 | val expectedSchema = getResourceString("/test_data/nested/nested3Schema.txt") 71 | val expectedResults = getResourceString("/test_data/nested/nested3Results.json") 72 | 73 | val 
df = nestedTestCaseFactory.getTestCase 74 | 75 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "struct2.inner1", "skey2", (c, gf) => 76 | concat(gf("key1"), lit(" "), gf("struct2.inner1.key5").cast(StringType), lit(" "), c.getField("key6")) 77 | ).select("key1", "struct2") 78 | 79 | val actualSchema = dfOut.schema.treeString 80 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect()) 81 | 82 | assertSchema(actualSchema, expectedSchema) 83 | assertResults(actualResults, expectedResults) 84 | } 85 | 86 | test("Test extended array transformations work on a nested struct in an array") { 87 | val expectedSchema = getResourceString("/test_data/nested/nested4Schema.txt") 88 | val expectedResults = getResourceString("/test_data/nested/nested4Results.json") 89 | 90 | val df = nestedTestCaseFactory.getTestCase 91 | 92 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "array1", "skey3", (c, gf) => 93 | concat(gf("key1"), lit(" "), gf("array1.key7").cast(StringType), lit(" "), c.getField("key8")) 94 | ).select("key1", "array1") 95 | 96 | val actualSchema = dfOut.schema.treeString 97 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect()) 98 | 99 | assertSchema(actualSchema, expectedSchema) 100 | assertResults(actualResults, expectedResults) 101 | } 102 | 103 | test("Test extended array transformations work on a nested struct in an array of an array") { 104 | val expectedSchema = getResourceString("/test_data/nested/nested5Schema.txt") 105 | val expectedResults = getResourceString("/test_data/nested/nested5Results.json") 106 | 107 | val df = nestedTestCaseFactory.getTestCase 108 | 109 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "array2.inner2", "out", (c, gf) => 110 | concat(gf("key1"), 111 | lit(" "), 112 | gf("array2.key2").cast(StringType), 113 | lit(" "), 114 | gf("array2.inner2.key9"), 115 | lit(" "), 116 | c.getField("key10")) 117 | ).select("key1", "array2") 118 | 119 | val actualSchema = dfOut.schema.treeString 120 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect()) 121 | 122 | assertSchema(actualSchema, expectedSchema) 123 | assertResults(actualResults, expectedResults) 124 | } 125 | 126 | test("Test extended array transformations work if a nested struct in an array is accessed") { 127 | val expectedSchema = getResourceString("/test_data/nested/nested6Schema.txt") 128 | val expectedResults = getResourceString("/test_data/nested/nested6Results.json") 129 | 130 | val df = nestedTestCaseFactory.getTestCase 131 | 132 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "array2.inner2", "out", (c, gf) => 133 | concat(c.getField("key10"), 134 | lit(" "), 135 | gf("array2.inner2.struct3.k1").cast(StringType)) 136 | ).select("array2") 137 | 138 | val actualSchema = dfOut.schema.treeString 139 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect()) 140 | 141 | assertSchema(actualSchema, expectedSchema) 142 | assertResults(actualResults, expectedResults) 143 | } 144 | 145 | test("Test extended array transformations work when a nested struct in an array is accessed") { 146 | val expectedSchema = getResourceString("/test_data/nested/nested7Schema.txt") 147 | val expectedResults = getResourceString("/test_data/nested/nested7Results.json") 148 | 149 | val df = nestedTestCaseFactory.getTestCase 150 | 151 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "array2.inner2.struct3", "out", (c, 
gf) => 152 | concat(c.getField("k1"), 153 | lit(" "), 154 | gf("array2.inner2.key10").cast(StringType)) 155 | ).select("array2") 156 | 157 | val actualSchema = dfOut.schema.treeString 158 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect()) 159 | 160 | assertSchema(actualSchema, expectedSchema) 161 | assertResults(actualResults, expectedResults) 162 | } 163 | 164 | test("Test extended array transformations with error column work if a nested struct in an array is accessed") { 165 | val expectedSchema = getResourceString("/test_data/nested/nested8Schema.txt") 166 | val expectedResults = getResourceString("/test_data/nested/nested8Results.json") 167 | 168 | val df = nestedTestCaseFactory.getTestCase 169 | 170 | val dfOut = NestedArrayTransformations.nestedExtendedStructAndErrorMap(df, "array2.inner2", "out", "errCol", (c, gf) => 171 | concat(c.getField("key10"), 172 | lit(" "), 173 | gf("array2.inner2.struct3.k1").cast(StringType)) 174 | , 175 | (_, gf) => { 176 | when(gf("array2.inner2.struct3.k1") =!= 1, 177 | callUDF("confCastErr", lit("k1!==1"), gf("array2.inner2.struct3.k1").cast(StringType)) 178 | ).otherwise(null) 179 | } 180 | ).select("array2", "errCol") 181 | 182 | val actualSchema = dfOut.schema.treeString 183 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect()) 184 | 185 | assertSchema(actualSchema, expectedSchema) 186 | assertResults(actualResults, expectedResults) 187 | } 188 | 189 | test("Test extended array transformations with error column that has existing errors") { 190 | val expectedSchema = getResourceString("/test_data/nested/nested9Schema.txt") 191 | val expectedResults = getResourceString("/test_data/nested/nested9Results.json") 192 | 193 | val df = nestedTestCaseFactory 194 | .getTestCase 195 | .withColumn("errCol", array(typedLit(ErrorMessage("Initial", "000", "ErrMsg", "id", Seq(), Seq())))) 196 | 197 | val dfOut = NestedArrayTransformations.nestedExtendedStructAndErrorMap(df, "array2.inner2", "out", "errCol", (c, gf) => 198 | concat(c.getField("key10"), 199 | lit(" "), 200 | gf("array2.inner2.struct3.k1").cast(StringType)) 201 | , 202 | (_, gf) => { 203 | when(gf("array2.inner2.struct3.k1") =!= 1, 204 | callUDF("confCastErr", lit("k1!==1"), gf("array2.inner2.struct3.k1").cast(StringType)) 205 | ).otherwise(null) 206 | } 207 | ).select("array2", "errCol") 208 | 209 | val actualSchema = dfOut.schema.treeString 210 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect()) 211 | 212 | assertSchema(actualSchema, expectedSchema) 213 | assertResults(actualResults, expectedResults) 214 | } 215 | 216 | test("Test extended array transformations with error column for an array inside a double nested struct") { 217 | val expectedSchema = getResourceString("/test_data/nested/nested10Schema.txt") 218 | val expectedResults = getResourceString("/test_data/nested/nested10Results.json") 219 | 220 | val df = nestedTestCaseFactory 221 | .getTestCase 222 | .withColumn("errCol", array(typedLit(ErrorMessage("Initial", "000", "ErrMsg", "id", Seq(), Seq())))) 223 | 224 | val dfOut = NestedArrayTransformations.nestedExtendedStructAndErrorMap(df, "struct3.inner3.array3", 225 | "struct3.inner3.array3.out", "errCol", (c, gf) => 226 | concat(c.getField("a1"), 227 | lit(" "), 228 | gf("struct3.inner3.array3.a2").cast(StringType)) 229 | , 230 | (_, gf) => { 231 | when(gf("struct3.inner3.array3.a1") =!= 3, 232 | callUDF("confCastErr", lit("a1!==3"), gf("struct3.inner3.array3.a1").cast(StringType)) 233 | 
).otherwise(null) 234 | } 235 | ).select("struct3", "errCol") 236 | 237 | val actualSchema = dfOut.schema.treeString 238 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect()) 239 | 240 | assertSchema(actualSchema, expectedSchema) 241 | assertResults(actualResults, expectedResults) 242 | } 243 | 244 | private def getResourceString(name: String): String = 245 | IOUtils.toString(getClass.getResourceAsStream(name), "UTF-8") 246 | 247 | private def assertSchema(actualSchema: String, expectedSchema: String): Unit = { 248 | if (actualSchema != expectedSchema) { 249 | log.error("EXPECTED:") 250 | log.error(expectedSchema) 251 | log.error("ACTUAL:") 252 | log.error(actualSchema) 253 | fail("Actual conformed schema does not match the expected schema (see above).") 254 | } 255 | } 256 | 257 | private def assertResults(actualResults: String, expectedResults: String): Unit = { 258 | if (!expectedResults.startsWith(actualResults)) { 259 | log.error("EXPECTED:") 260 | log.error(expectedResults) 261 | log.error("ACTUAL:") 262 | log.error(actualResults) 263 | fail("Actual conformed dataset JSON does not match the expected JSON (see above).") 264 | } 265 | } 266 | } 267 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/spark/hats/transformations/samples/DeepArraySamples.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.spark.hats.transformations.samples 18 | 19 | // Examples for constructing DataFrames containing arrays at various levels of nesting. 20 | // They also include an error column to test transformations that can fail per field. 21 | 22 | // The case classes are declared at the package level so they can be used to create Spark Datasets. 23 | // They are declared package-private so their names won't pollute the public/exported namespace. 24 | 25 | // Structs of Structs example 26 | private[transformations] case class Address(city: String, street: String) 27 | 28 | private[transformations] case class Employee(name: String, address: Address) 29 | 30 | private[transformations] case class TestObj(id: Int, employee: Employee) 31 | 32 | private[transformations] case class TestObj2(id: Int, employee: Seq[Employee]) 33 | 34 | // Arrays of primitives example 35 | private[transformations] case class FunWords(id: Int, words: Seq[String]) 36 | 37 | // Arrays of arrays of primitives example 38 | private[transformations] case class GeoData(id: Int, matrix: Seq[Seq[String]]) 39 | 40 | // Arrays of structs example 41 | private[transformations] case class Person(firstName: String, lastName: String) 42 | 43 | private[transformations] case class Team(id: Int, person: Seq[Person]) 44 | 45 | private[transformations] case class Dept(name: String, team: Team) 46 | 47 | // Arrays of Arrays of struct 48 | private[transformations] case class Tournament(id: Int, person: Seq[Seq[Person]]) 49 | 50 | // Arrays of structs in arrays of structs 51 | private[transformations] case class Condition(conif: String, conthen: String, amount: Double) 52 | 53 | private[transformations] case class Leg(legid: Int, conditions: Seq[Condition]) 54 | 55 | private[transformations] case class Trade(id: Int, legs: Seq[Leg]) 56 | 57 | // Structs of Structs example with error column 58 | private[transformations] case class AddressWithErrColumn(city: String, street: String, buildingNum: Int, zip: String, 59 | errors: Seq[ErrorMessage]) 60 | 61 | private[transformations] case class AddressNoErrColumn(city: String, street: String, buildingNum: Int, zip: String) 62 | 63 | private[transformations] case class EmployeeNoErrorColumn(name: String, address: AddressNoErrColumn) 64 | 65 | private[transformations] case class TestObj1WithErrorColumn(id: Int, employee: EmployeeNoErrorColumn, errors: 66 | Seq[ErrorMessage]) 67 | 68 | private[transformations] case class TestObj2WithErrorColumn(id: Int, employee: Seq[EmployeeNoErrorColumn], 69 | errors: Seq[ErrorMessage]) 70 | 71 | private[transformations] case class TestObj2NoErrColumn(id: Int, employee: Seq[EmployeeNoErrorColumn]) 72 | 73 | // Arrays of primitives example with error column 74 | private[transformations] case class FunNumbersWithErrorColumn(id: Int, nums: Seq[String], errors: Seq[ErrorMessage]) 75 | 76 | // Arrays of arrays of primitives example with error column 77 | private[transformations] case class GeoDataWithErrorColumn(id: Int, matrix: Seq[Seq[String]], errors: Seq[ErrorMessage]) 78 | 79 | // Arrays of structs in arrays of structs with error column 80 | private[transformations] case class TradeWithErrorColumn(id: Int, legs: Seq[Leg], errors: Seq[ErrorMessage]) 81 | 82 | object DeepArraySamples { 83 | // scalastyle:off magic.number 84 | // scalastyle:off line.size.limit 85 | 86 | // WITHOUT error column 87 | 88 | // Plain 89 | val plainSampleN: Seq[Address] = Seq( 90 | Address("Olomuc", "Vodickova"), 91 | Address("Ostrava", "Vlavska"), 92 | Address("Plzen", "Kralova") 93 | 
) 94 | 95 | // Struct of struct 96 | val structOfStructSampleN: Seq[TestObj] = Seq( 97 | TestObj(1, Employee("Martin", Address("Olomuc", "Vodickova"))), 98 | TestObj(1, Employee("Petr", Address("Ostrava", "Vlavska"))), 99 | TestObj(1, Employee("Vojta", Address("Plzen", "Kralova"))) 100 | ) 101 | 102 | // Array of struct of struct 103 | val arrayOfstructOfStructSampleN: Seq[TestObj2] = Seq( 104 | TestObj2(1, Seq(Employee("Martin", Address("Olomuc", "Vodickova")), Employee("Stephan", Address("Olomuc", "Vodickova")))), 105 | TestObj2(2, Seq(Employee("Petr", Address("Ostrava", "Vlavska")), Employee("Michal", Address("Ostrava", "Vlavska")))), 106 | TestObj2(3, Seq(Employee("Vojta", Address("Plzen", "Kralova")))) 107 | ) 108 | 109 | // Arrays of primitives 110 | val arraysOfPrimitivesSampleN: Seq[FunWords] = Seq( 111 | FunWords(1, Seq("Gizmo", "Blurp", "Buzinga")), 112 | FunWords(1, Seq("Quirk", "Zap", "Mmrnmhrm")) 113 | ) 114 | 115 | // Arrays of arrays of primitives 116 | val arraysOfArraysOfPrimitivesSampleN: Seq[GeoData] = Seq( 117 | GeoData(1, Seq(Seq("Tree", "Table"), Seq("Map", "Duck"))), 118 | GeoData(2, Seq(Seq("Apple", "Machine"), Seq("List", "Duck"))), 119 | GeoData(3, Seq(Seq("Computer", "Snake"), Seq("Sun", "Star"))) 120 | ) 121 | 122 | // Arrays of structs 123 | val arraysOfStructsSampleN: Seq[Team] = Seq( 124 | Team(1, Seq(Person("John", "Smith"), Person("Jack", "Brown"))), 125 | Team(1, Seq(Person("Merry", "Cook"), Person("Jane", "Clark"))) 126 | ) 127 | 128 | // Arrays of arrays of struct 129 | val arraysOfArraysOfStructSampleN: Seq[Tournament] = Seq( 130 | Tournament(1, Seq(Seq(Person("Mona Lisa", "Harddrive")), Seq(Person("Lenny", "Linux"), Person("Dot", "Not")))), 131 | Tournament(1, Seq(Seq(Person("Eddie", "Larrison")), Seq(Person("Scarlett", "Johanson"), Person("William", "Windows")))) 132 | ) 133 | 134 | // Arrays of struct with arrays of struct 135 | val arraysOfStrtuctsDeepSampleN: Seq[Trade] = Seq( 136 | Trade(1, Seq( 137 | Leg(100, Seq( 138 | Condition("if bid>10", "buy", 100), Condition("if sell<5", "sell", 150), Condition("if sell<1", "sell", 1000))), 139 | Leg(101, Seq( 140 | Condition("if bid<50", "sell", 200), Condition("if sell>30", "buy", 175), Condition("if sell>25", "buy", 225))) 141 | )), 142 | Trade(2, Seq( 143 | Leg(102, Seq( 144 | Condition("if bid>11", "buy", 100), Condition("if sell<6", "sell", 150), Condition("if sell<2", "sell", 1000))), 145 | Leg(103, Seq( 146 | Condition("if bid<51", "sell", 200), Condition("if sell>31", "buy", 175), Condition("if sell>26", "buy", 225))) 147 | )), 148 | Trade(3, Seq( 149 | Leg(104, Seq( 150 | Condition("if bid>12", "buy", 100), Condition("if sell<7", "sell", 150), Condition("if sell<3", "sell", 1000))), 151 | Leg(105, Seq( 152 | Condition("if bid<52", "sell", 200), Condition("if sell>32", "buy", 175), Condition("if sell>27", "buy", 225))) 153 | )) 154 | ) 155 | 156 | // WITH error column 157 | 158 | // Plain 159 | val plainSampleE: Seq[AddressWithErrColumn] = Seq( 160 | AddressWithErrColumn("Olomuc", "Vodickova", 12, "12000", Seq()), 161 | AddressWithErrColumn("Ostrava", "Vlavska", 110, "1455a", Seq()), 162 | AddressWithErrColumn("Plzen", "Kralova", 71, "b881", 163 | Seq(ErrorMessage("myErrorType", "E-1", "Testing This stuff", "whatEvColumn", Seq("some value")))) 164 | ) 165 | 166 | // Struct of struct 167 | val structOfStructSampleE: Seq[TestObj1WithErrorColumn] = Seq( 168 | TestObj1WithErrorColumn(1, EmployeeNoErrorColumn("Martin", AddressNoErrColumn("Olomuc", "Vodickova", 12, "12000")), Nil), 169 | 
TestObj1WithErrorColumn(1, EmployeeNoErrorColumn("Petr", AddressNoErrColumn("Ostrava", "Vlavska", 110, "1455a")), 170 | Seq(ErrorMessage("myErrorType", "E-1", "Testing This stuff", "whatEvColumn", Seq("some value")))), 171 | TestObj1WithErrorColumn(1, EmployeeNoErrorColumn("Vojta", AddressNoErrColumn("Plzen", "Kralova", 71, "b881")), Nil) 172 | ) 173 | 174 | // Array of struct of struct 175 | val arrayOfStructOfStructErrSampleE: Seq[TestObj2WithErrorColumn] = Seq( 176 | TestObj2WithErrorColumn(1, Seq( 177 | EmployeeNoErrorColumn("Martin", AddressNoErrColumn("Olomuc", "Vodickova", 732, "73200")), 178 | EmployeeNoErrorColumn("Stephan", AddressNoErrColumn("Olomuc", "Vodickova", 77, "77-333"))), Nil), 179 | TestObj2WithErrorColumn(2, Seq( 180 | EmployeeNoErrorColumn("Petr", AddressNoErrColumn("Ostrava", "Vlavska", 25, "a9991")), 181 | EmployeeNoErrorColumn("Michal", AddressNoErrColumn("Ostrava", "Vlavska", 334, "552-aa1"))), 182 | Seq(ErrorMessage("myErrorType", "E-1", "Testing This stuff", "whatEvColumn", Seq("some value")))), 183 | TestObj2WithErrorColumn(3, Seq( 184 | EmployeeNoErrorColumn("Vojta", AddressNoErrColumn("Plzen", "Kralova", 33, "993"))), Nil) 185 | ) 186 | 187 | val arrayOfStructOfStruvtNoErrSampleE: Seq[TestObj2NoErrColumn] = Seq( 188 | TestObj2NoErrColumn(1, Seq( 189 | EmployeeNoErrorColumn("Martin", AddressNoErrColumn("Olomuc", "Vodickova", 732, "73200")), 190 | EmployeeNoErrorColumn("Stephan", AddressNoErrColumn("Olomuc", "Vodickova", 77, "77-333")))), 191 | TestObj2NoErrColumn(2, Seq( 192 | EmployeeNoErrorColumn("Petr", AddressNoErrColumn("Ostrava", "Vlavska", 25, "a9991")), 193 | EmployeeNoErrorColumn("Michal", AddressNoErrColumn("Ostrava", "Vlavska", 334, "552-aa1")))), 194 | TestObj2NoErrColumn(3, Seq( 195 | EmployeeNoErrorColumn("Vojta", AddressNoErrColumn("Plzen", "Kralova", 33, "993")))) 196 | ) 197 | 198 | // Arrays of primitives 199 | val arraysOfPrimitivesSampleE: Seq[FunNumbersWithErrorColumn] = Seq( 200 | FunNumbersWithErrorColumn(1, Seq("7755", "a212", "222-111"), 201 | Seq(ErrorMessage("myErrorType", "E-1", "Testing This stuff", "whatEvColumn", Seq("some value")))), 202 | FunNumbersWithErrorColumn(1, Seq("223a", "223a", "775"), Nil), 203 | FunNumbersWithErrorColumn(1, Seq("5", "-100", "9999999"), Nil) 204 | ) 205 | 206 | // Arrays of arrays of primitives 207 | val arraysOfArraysOfPrimitivesSampleE: Seq[GeoDataWithErrorColumn] = Seq( 208 | GeoDataWithErrorColumn(1, Seq(Seq("10", "11b"), Seq("11b", "12")), 209 | Seq(ErrorMessage("myErrorType", "E-1", "Testing This stuff", "whatEvColumn", Seq("some value")))), 210 | GeoDataWithErrorColumn(2, Seq(Seq("20f", "300"), Seq("1000", "10-10")), Nil), 211 | GeoDataWithErrorColumn(3, Seq(Seq("775", "223"), Seq("100", "0")), Nil) 212 | ) 213 | 214 | // Arrays of struct with arrays of struct 215 | val arraysOfStrtuctsDeepSampleE: Seq[TradeWithErrorColumn] = Seq( 216 | TradeWithErrorColumn(1, Seq( 217 | Leg(100, Seq( 218 | Condition("if bid>10", "100", 100), Condition("if sell<5", "300a", 150), Condition("if sell<1", "1000", 1000))), 219 | Leg(101, Seq( 220 | Condition("if bid<50", "200", 200), Condition("if sell>30", "175b", 175), Condition("if sell>25", "225-225", 225))) 221 | ), Nil), 222 | TradeWithErrorColumn(2, Seq( 223 | Leg(102, Seq( 224 | Condition("if bid>11", "100", 100), Condition("if sell<6", "150", 150), Condition("if sell<2", "1000", 1000))), 225 | Leg(103, Seq( 226 | Condition("if bid<51", "200", 200), Condition("if sell>31", "175", 175), Condition("if sell>26", "225", 225))) 227 | ), Nil), 228 | 
TradeWithErrorColumn(3, Seq( 229 | Leg(104, Seq( 230 | Condition("if bid>12", "1OO", 100), Condition("if sell<7", "150x", 150), Condition("if sell<3", "-1000-", 1000))), 231 | Leg(105, Seq( 232 | Condition("if bid<52", "2OO", 200), Condition("if sell>32", "f175", 175), Condition("if sell>27", "225_", 225))) 233 | ), Seq(ErrorMessage("myErrorType", "E-1", "Testing This stuff", "whatEvColumn", Seq("some value")))) 234 | ) 235 | } 236 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/spark/hats/transformations/samples/ErrorMessage.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 15 | */ 16 | 17 | package za.co.absa.spark.hats.transformations.samples 18 | 19 | /** 20 | * Case class to represent an error message 21 | * 22 | * @param errType - Type or source of the error 23 | * @param errCode - Internal error code 24 | * @param errMsg - Textual description of the error 25 | * @param errCol - The name of the column where the error occurred 26 | * @param rawValues - Sequence of raw values (which are the potential culprits of the error) 27 | * @param mappings - Sequence of Mappings i.e Mapping Table Column -> Equivalent Mapped Dataset column 28 | */ 29 | case class ErrorMessage(errType: String, errCode: String, errMsg: String, errCol: String, rawValues: Seq[String], mappings: Seq[Mapping] = Seq()) 30 | case class Mapping(mappingTableColumn: String, mappedDatasetColumn: String) 31 | 32 | object ErrorMessage { 33 | val errorColumnName = "errCol" 34 | 35 | def confCastErr(errCol: String, rawValue: String): ErrorMessage = ErrorMessage( 36 | errType = "confCastError", 37 | errCode = "E00003", 38 | errMsg = "Conformance Error - Null returned by casting conformance rule", 39 | errCol = errCol, 40 | rawValues = Seq(rawValue)) 41 | 42 | } 43 | 44 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/spark/hats/transformations/samples/NestedMapTestCaseFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 |  */ 16 | 17 | package za.co.absa.spark.hats.transformations.samples 18 | 19 | import org.apache.spark.sql.types._ 20 | import org.apache.spark.sql.{DataFrame, Row, SparkSession} 21 | 22 | class NestedMapTestCaseFactory(implicit spark: SparkSession) { 23 | 24 |   private val testCaseSchema = new StructType() 25 |     .add("name", StringType) 26 |     .add("addresses", ArrayType(new StructType() 27 |       .add("city", StringType) 28 |       .add("state", StringType))) 29 |     .add("properties", MapType(StringType, StringType)) 30 | 31 |   private val testCaseData = Seq( 32 |     Row("John", List(Row("Newark", "NY"), Row("Brooklyn", "NY")), Map("hair" -> "black", "eyes" -> "brown", "height" -> "178")), 33 |     Row("Kate", List(Row("San Jose", "CA"), Row("Sandiago", "CA")), Map("hair" -> "brown", "eyes" -> "black", "height" -> "165")), 34 |     Row("William", List(Row("Las Vegas", "NV")), Map("hair" -> "red", "eye" -> "gray", "height" -> "185")), 35 |     Row("Sarah", null, Map("hair" -> "blond", "eyes" -> "red", "height" -> "162")), 36 |     Row("Michael", List(Row("Sacramento", "CA"), Row("San Diego", "CA")), Map("white" -> "black", "eyes" -> "black", "height" -> "180")) 37 |   ) 38 | 39 |   def getTestCase: DataFrame = { 40 |     spark.createDataFrame( 41 |       spark.sparkContext.parallelize(testCaseData), 42 |       testCaseSchema 43 |     ).orderBy("name") 44 |   } 45 | 46 | } 47 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/spark/hats/transformations/samples/NestedTestCaseFactory.scala: -------------------------------------------------------------------------------- 1 | /* 2 |  * Copyright 2020 ABSA Group Limited 3 |  * 4 |  * Licensed under the Apache License, Version 2.0 (the "License"); 5 |  * you may not use this file except in compliance with the License. 6 |  * You may obtain a copy of the License at 7 |  * 8 |  *     http://www.apache.org/licenses/LICENSE-2.0 9 |  * 10 |  * Unless required by applicable law or agreed to in writing, software 11 |  * distributed under the License is distributed on an "AS IS" BASIS, 12 |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 |  * See the License for the specific language governing permissions and 14 |  * limitations under the License.
15 | */ 16 | 17 | package za.co.absa.spark.hats.transformations.samples 18 | 19 | import org.apache.spark.sql.types._ 20 | import org.apache.spark.sql.{DataFrame, SparkSession, types} 21 | 22 | class NestedTestCaseFactory(implicit spark: SparkSession) { 23 | 24 | private val testCaseSchema = StructType( 25 | Array( 26 | StructField("id", LongType), 27 | StructField("key1", LongType), 28 | StructField("key2", LongType), 29 | StructField("struct1", StructType(Array( 30 | StructField("key3", IntegerType), 31 | StructField("key4", IntegerType) 32 | ))), 33 | StructField("struct2", StructType(Array( 34 | StructField("inner1", StructType(Array( 35 | StructField("key5", LongType), 36 | StructField("key6", LongType), 37 | StructField("skey1", StringType) 38 | ))) 39 | ))), 40 | StructField("struct3", StructType(Array( 41 | StructField("inner3", StructType(Array( 42 | StructField("array3", types.ArrayType(StructType(Array( 43 | StructField("a1", LongType), 44 | StructField("a2", LongType), 45 | StructField("a3", StringType) 46 | )))) 47 | ))) 48 | ))), 49 | StructField("array1", types.ArrayType(StructType(Array( 50 | StructField("key7", LongType), 51 | StructField("key8", LongType), 52 | StructField("skey2", StringType) 53 | )))), 54 | StructField("array2", types.ArrayType(StructType(Array( 55 | StructField("key2", LongType), 56 | StructField("inner2", types.ArrayType(StructType(Array( 57 | StructField("key9", LongType), 58 | StructField("key10", LongType), 59 | StructField("struct3", StructType(Array( 60 | StructField("k1", IntegerType), 61 | StructField("k2", IntegerType) 62 | ))) 63 | )))) 64 | )))) 65 | )) 66 | 67 | def getTestCase: DataFrame = { 68 | spark.read 69 | .schema(testCaseSchema) 70 | .json(getClass.getResource("/test_data/nested/nestedDf1.json").getPath) 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /src/test/scala/za/co/absa/spark/hats/transformations/samples/SampleErrorUDFs.scala: -------------------------------------------------------------------------------- 1 | /* 2 | * Copyright 2020 ABSA Group Limited 3 | * 4 | * Licensed under the Apache License, Version 2.0 (the "License"); 5 | * you may not use this file except in compliance with the License. 6 | * You may obtain a copy of the License at 7 | * 8 | * http://www.apache.org/licenses/LICENSE-2.0 9 | * 10 | * Unless required by applicable law or agreed to in writing, software 11 | * distributed under the License is distributed on an "AS IS" BASIS, 12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | * See the License for the specific language governing permissions and 14 | * limitations under the License. 
15 | */ 16 | 17 | package za.co.absa.spark.hats.transformations.samples 18 | 19 | import org.apache.spark.sql.SparkSession 20 | 21 | import scala.collection.mutable 22 | 23 | case class SampleErrorUDFs()(implicit val spark: SparkSession) { 24 | 25 | spark.udf.register("confCastErr", { (errCol: String, rawValue: String) => 26 | ErrorMessage.confCastErr(errCol, rawValue) 27 | }) 28 | 29 | spark.udf.register("arrayDistinctErrors", 30 | (arr: mutable.WrappedArray[ErrorMessage]) => 31 | if (arr != null) { 32 | arr.distinct.filter((a: AnyRef) => a != null) 33 | } else { 34 | Seq[ErrorMessage]() 35 | } 36 | ) 37 | 38 | } 39 | -------------------------------------------------------------------------------- /version.sbt: -------------------------------------------------------------------------------- 1 | ThisBuild / version := "0.3.1-SNAPSHOT" 2 | --------------------------------------------------------------------------------
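Taken together, the test fixtures above follow one pattern: a factory class builds a nested test DataFrame, instantiating SampleErrorUDFs registers the error-handling UDFs as a constructor side effect, and ErrorMessage.confCastErr produces the error records the suites assert against. The sketch below wires these pieces together end to end. It is a minimal illustration only: the FixtureDemo object, the local SparkSession settings, and the use of the key1 column are assumptions made for this example, not code from the repository.

import org.apache.spark.sql.SparkSession
import za.co.absa.spark.hats.transformations.samples.{ErrorMessage, NestedTestCaseFactory, SampleErrorUDFs}

// Hypothetical demo object, not part of the repository.
object FixtureDemo {
  def main(args: Array[String]): Unit = {
    // Assumption: a plain local session; the test suites obtain theirs via SparkTestBase.
    implicit val spark: SparkSession = SparkSession.builder()
      .master("local[2]")
      .appName("spark-hats-fixture-demo")
      .getOrCreate()

    // Constructing the case class registers the 'confCastErr' and
    // 'arrayDistinctErrors' UDFs as a side effect.
    SampleErrorUDFs()

    // Builds the nested DataFrame from test_data/nested/nestedDf1.json
    // (assumes the test resources are on the classpath).
    val df = new NestedTestCaseFactory(spark).getTestCase
    df.printSchema()

    // The same error record the 'confCastErr' UDF returns, built directly.
    println(ErrorMessage.confCastErr("key1", "not-a-number"))

    // Calls the registered UDF from SQL; key1 is a LongType column in the
    // factory schema, cast to string here to play the role of a raw value.
    df.selectExpr("id", "confCastErr('key1', cast(key1 as string)) as errCol")
      .show(truncate = false)
  }
}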