├── .editorconfig
├── .github
│   ├── ISSUE_TEMPLATE
│   │   ├── bug_report.md
│   │   └── feature_request.md
│   └── workflows
│       ├── jacoco_check.yml
│       └── test.yml
├── .gitignore
├── LICENSE
├── README.md
├── build.sbt
├── project
│   ├── Dependencies.scala
│   ├── build.properties
│   └── plugins.sbt
├── publish.sbt
├── src
│   ├── main
│   │   ├── scala
│   │   │   └── za
│   │   │       └── co
│   │   │           └── absa
│   │   │               └── spark
│   │   │                   └── hats
│   │   │                       ├── Extensions.scala
│   │   │                       ├── transformations
│   │   │                       │   ├── ArrayContext.scala
│   │   │                       │   └── NestedArrayTransformations.scala
│   │   │                       └── utils
│   │   │                           ├── JsonUtils.scala
│   │   │                           └── SchemaUtils.scala
│   │   ├── scala_2.11
│   │   │   └── za
│   │   │       └── co
│   │   │           └── absa
│   │   │               └── spark
│   │   │                   └── hats
│   │   │                       └── HofsWrapper.scala
│   │   ├── scala_2.12
│   │   │   └── za
│   │   │       └── co
│   │   │           └── absa
│   │   │               └── spark
│   │   │                   └── hats
│   │   │                       └── HofsWrapper.scala
│   │   └── scala_2.13
│   │       └── za
│   │           └── co
│   │               └── absa
│   │                   └── spark
│   │                       └── hats
│   │                           └── HofsWrapper.scala
│   └── test
│       ├── resources
│       │   ├── log4j.properties
│       │   ├── log4j2.properties
│       │   └── test_data
│       │       └── nested
│       │           ├── nested10Results.json
│       │           ├── nested10Schema.txt
│       │           ├── nested1Results.json
│       │           ├── nested1Schema.txt
│       │           ├── nested2Results.json
│       │           ├── nested2Schema.txt
│       │           ├── nested3Results.json
│       │           ├── nested3Schema.txt
│       │           ├── nested4Results.json
│       │           ├── nested4Schema.txt
│       │           ├── nested5Results.json
│       │           ├── nested5Schema.txt
│       │           ├── nested6Results.json
│       │           ├── nested6Schema.txt
│       │           ├── nested7Results.json
│       │           ├── nested7Schema.txt
│       │           ├── nested8Results.json
│       │           ├── nested8Schema.txt
│       │           ├── nested9Results.json
│       │           ├── nested9Schema.txt
│       │           └── nestedDf1.json
│       └── scala
│           └── za
│               └── co
│                   └── absa
│                       └── spark
│                           └── hats
│                               ├── SparkTestBase.scala
│                               └── transformations
│                                   ├── DeepArrayErrorTransformationSuite.scala
│                                   ├── DeepArrayTransformationSuite.scala
│                                   ├── ExtendedTransformationsSuite.scala
│                                   └── samples
│                                       ├── DeepArraySamples.scala
│                                       ├── ErrorMessage.scala
│                                       ├── NestedMapTestCaseFactory.scala
│                                       ├── NestedTestCaseFactory.scala
│                                       └── SampleErrorUDFs.scala
└── version.sbt
/.editorconfig:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2019 ABSA Group Limited
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 |
16 | # top-most EditorConfig file
17 | root = true
18 |
19 | [*]
20 | charset = utf-8
21 | end_of_line = lf
22 | trim_trailing_whitespace = true
23 |
24 | [*.xml]
25 | indent_size = 4
26 | indent_style = space
27 | insert_final_newline = true
28 |
29 | [*.properties]
30 | insert_final_newline = true
31 |
32 | [*.{java,scala,js,json,css}]
33 | indent_size = 2
34 | indent_style = space
35 | insert_final_newline = true
36 | max_line_length = 120
37 |
38 | [*.md]
39 | trim_trailing_whitespace = false
40 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Feature request
3 | about: Suggest an idea for this project
4 | title: ''
5 | labels: enhancement
6 | assignees: ''
7 |
8 | ---
9 |
10 | ## Background
11 | A clear and concise description of where the limitation lies.
12 |
13 | ## Feature
14 | A description of the requested feature.
15 |
16 | ## Example [Optional]
17 | A simple example if applicable.
18 |
19 | ## Proposed Solution [Optional]
20 | Solution Ideas
21 | 1.
22 | 2.
23 | 3.
24 |
--------------------------------------------------------------------------------
/.github/workflows/jacoco_check.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2018 ABSA Group Limited
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 |
16 | name: JaCoCo report
17 |
18 | on:
19 | pull_request:
20 | branches: [ master ]
21 | types: [ opened, edited, synchronize, reopened ]
22 |
23 | jobs:
24 | test:
25 | runs-on: ubuntu-latest
26 | strategy:
27 | fail-fast: false
28 | matrix:
29 | include:
30 | - scala: 2.11.12
31 | scalaShort: "2.11"
32 | spark: 2.4.8
33 | overall: 0.0
34 | changed: 80.0
35 | - scala: 2.12.18
36 | scalaShort: "2.12"
37 | spark: 3.2.4
38 | overall: 0.0
39 | changed: 80.0
40 | - scala: 2.13.11
41 | scalaShort: "2.13"
42 | spark: 3.4.1
43 | overall: 0.0
44 | changed: 80.0
45 | name: Check code-coverage by JaCoCo - Spark ${{matrix.spark}} on Scala ${{matrix.scala}}
46 | steps:
47 | - name: Checkout code
48 | uses: actions/checkout@v2
49 | - uses: coursier/cache-action@v5
50 | - name: Setup Scala
51 | uses: olafurpg/setup-scala@v10
52 | with:
53 | java-version: "adopt@1.8"
54 | - name: Build and run tests
55 | run: sbt ++${{matrix.scala}} jacoco -DSPARK_VERSION=${{matrix.spark}}
56 | - name: Add coverage to PR for Scala ${{matrix.scala}} & Spark ${{matrix.spark}}
57 | id: jacoco
58 | uses: madrapps/jacoco-report@v1.3
59 | with:
60 | paths: ${{ github.workspace }}/target/scala-${{ matrix.scalaShort }}/jacoco/report/jacoco.xml
61 | token: ${{ secrets.GITHUB_TOKEN }}
62 | min-coverage-overall: ${{ matrix.overall }}
63 | min-coverage-changed-files: ${{ matrix.changed }}
64 | title: JaCoCo code coverage report - Scala ${{ matrix.scala }} & Spark ${{ matrix.spark }}
65 | update-comment: true
66 | - name: Get the Coverage info
67 | run: |
68 | echo "Total coverage ${{ steps.jacoco.outputs.coverage-overall }}"
69 | echo "Changed Files coverage ${{ steps.jacoco.outputs.coverage-changed-files }}"
70 | - name: Fail PR if changed files coverage is less than ${{ matrix.changed }}%
 71 |         if: ${{ steps.jacoco.outputs.coverage-changed-files < matrix.changed }}
72 | uses: actions/github-script@v6
73 | with:
74 | script: |
75 | core.setFailed('Changed files coverage is less than ${{ matrix.changed }}%!')
76 |
77 |
--------------------------------------------------------------------------------
/.github/workflows/test.yml:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2019 ABSA Group Limited
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 |
16 | name: Build
17 |
18 | on:
19 | push:
20 | branches: [ master ]
21 | pull_request:
22 | branches: [ master ]
23 |
24 | jobs:
25 | test:
26 | runs-on: ubuntu-latest
27 | strategy:
28 | fail-fast: false
29 | matrix:
30 | scala: [2.11.12, 2.12.18, 2.13.11]
31 | spark: [2.4.8, 3.2.4, 3.4.1]
32 | exclude:
33 | - scala: 2.11.12
34 | spark: 3.2.4
35 | - scala: 2.11.12
36 | spark: 3.4.1
37 | - scala: 2.12.18
38 | spark: 2.4.8
39 | - scala: 2.13.11
40 | spark: 2.4.8
41 | name: Test Spark ${{matrix.spark}} on Scala ${{matrix.scala}}
42 | steps:
43 | - name: Checkout code
44 | uses: actions/checkout@v2
45 | - uses: coursier/cache-action@v5
46 | - name: Setup Scala
47 | uses: olafurpg/setup-scala@v10
48 | with:
49 | java-version: "adopt@1.8"
50 | - name: Build and run tests
51 | run: sbt ++${{matrix.scala}} test -DSPARK_VERSION=${{matrix.spark}}
52 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2019 ABSA Group Limited
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 |
16 | # use glob syntax.
17 | syntax: glob
18 | *.ser
19 | *.class
20 | *~
21 | *.bak
22 | #*.off
23 | *.old
24 |
25 | # eclipse conf file
26 | .settings
27 | .classpath
28 | .project
29 | .manager
30 | .scala_dependencies
31 |
32 | # idea
33 | .idea
34 | *.iml
35 |
36 | # building
37 | target
38 | build
39 | null
40 | tmp*
41 | temp*
42 | dist
43 | test-output
44 | build.log
45 |
46 | # other scm
47 | .svn
48 | .CVS
49 | .hg*
50 |
51 | .cache*
52 | dependency-reduced-pom.xml
53 |
54 | .bsp
55 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | Apache License
3 | Version 2.0, January 2004
4 | http://www.apache.org/licenses/
5 |
6 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
7 |
8 | 1. Definitions.
9 |
10 | "License" shall mean the terms and conditions for use, reproduction,
11 | and distribution as defined by Sections 1 through 9 of this document.
12 |
13 | "Licensor" shall mean the copyright owner or entity authorized by
14 | the copyright owner that is granting the License.
15 |
16 | "Legal Entity" shall mean the union of the acting entity and all
17 | other entities that control, are controlled by, or are under common
18 | control with that entity. For the purposes of this definition,
19 | "control" means (i) the power, direct or indirect, to cause the
20 | direction or management of such entity, whether by contract or
21 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
22 | outstanding shares, or (iii) beneficial ownership of such entity.
23 |
24 | "You" (or "Your") shall mean an individual or Legal Entity
25 | exercising permissions granted by this License.
26 |
27 | "Source" form shall mean the preferred form for making modifications,
28 | including but not limited to software source code, documentation
29 | source, and configuration files.
30 |
31 | "Object" form shall mean any form resulting from mechanical
32 | transformation or translation of a Source form, including but
33 | not limited to compiled object code, generated documentation,
34 | and conversions to other media types.
35 |
36 | "Work" shall mean the work of authorship, whether in Source or
37 | Object form, made available under the License, as indicated by a
38 | copyright notice that is included in or attached to the work
39 | (an example is provided in the Appendix below).
40 |
41 | "Derivative Works" shall mean any work, whether in Source or Object
42 | form, that is based on (or derived from) the Work and for which the
43 | editorial revisions, annotations, elaborations, or other modifications
44 | represent, as a whole, an original work of authorship. For the purposes
45 | of this License, Derivative Works shall not include works that remain
46 | separable from, or merely link (or bind by name) to the interfaces of,
47 | the Work and Derivative Works thereof.
48 |
49 | "Contribution" shall mean any work of authorship, including
50 | the original version of the Work and any modifications or additions
51 | to that Work or Derivative Works thereof, that is intentionally
52 | submitted to Licensor for inclusion in the Work by the copyright owner
53 | or by an individual or Legal Entity authorized to submit on behalf of
54 | the copyright owner. For the purposes of this definition, "submitted"
55 | means any form of electronic, verbal, or written communication sent
56 | to the Licensor or its representatives, including but not limited to
57 | communication on electronic mailing lists, source code control systems,
58 | and issue tracking systems that are managed by, or on behalf of, the
59 | Licensor for the purpose of discussing and improving the Work, but
60 | excluding communication that is conspicuously marked or otherwise
61 | designated in writing by the copyright owner as "Not a Contribution."
62 |
63 | "Contributor" shall mean Licensor and any individual or Legal Entity
64 | on behalf of whom a Contribution has been received by Licensor and
65 | subsequently incorporated within the Work.
66 |
67 | 2. Grant of Copyright License. Subject to the terms and conditions of
68 | this License, each Contributor hereby grants to You a perpetual,
69 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
70 | copyright license to reproduce, prepare Derivative Works of,
71 | publicly display, publicly perform, sublicense, and distribute the
72 | Work and such Derivative Works in Source or Object form.
73 |
74 | 3. Grant of Patent License. Subject to the terms and conditions of
75 | this License, each Contributor hereby grants to You a perpetual,
76 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
77 | (except as stated in this section) patent license to make, have made,
78 | use, offer to sell, sell, import, and otherwise transfer the Work,
79 | where such license applies only to those patent claims licensable
80 | by such Contributor that are necessarily infringed by their
81 | Contribution(s) alone or by combination of their Contribution(s)
82 | with the Work to which such Contribution(s) was submitted. If You
83 | institute patent litigation against any entity (including a
84 | cross-claim or counterclaim in a lawsuit) alleging that the Work
85 | or a Contribution incorporated within the Work constitutes direct
86 | or contributory patent infringement, then any patent licenses
87 | granted to You under this License for that Work shall terminate
88 | as of the date such litigation is filed.
89 |
90 | 4. Redistribution. You may reproduce and distribute copies of the
91 | Work or Derivative Works thereof in any medium, with or without
92 | modifications, and in Source or Object form, provided that You
93 | meet the following conditions:
94 |
95 | (a) You must give any other recipients of the Work or
96 | Derivative Works a copy of this License; and
97 |
98 | (b) You must cause any modified files to carry prominent notices
99 | stating that You changed the files; and
100 |
101 | (c) You must retain, in the Source form of any Derivative Works
102 | that You distribute, all copyright, patent, trademark, and
103 | attribution notices from the Source form of the Work,
104 | excluding those notices that do not pertain to any part of
105 | the Derivative Works; and
106 |
107 | (d) If the Work includes a "NOTICE" text file as part of its
108 | distribution, then any Derivative Works that You distribute must
109 | include a readable copy of the attribution notices contained
110 | within such NOTICE file, excluding those notices that do not
111 | pertain to any part of the Derivative Works, in at least one
112 | of the following places: within a NOTICE text file distributed
113 | as part of the Derivative Works; within the Source form or
114 | documentation, if provided along with the Derivative Works; or,
115 | within a display generated by the Derivative Works, if and
116 | wherever such third-party notices normally appear. The contents
117 | of the NOTICE file are for informational purposes only and
118 | do not modify the License. You may add Your own attribution
119 | notices within Derivative Works that You distribute, alongside
120 | or as an addendum to the NOTICE text from the Work, provided
121 | that such additional attribution notices cannot be construed
122 | as modifying the License.
123 |
124 | You may add Your own copyright statement to Your modifications and
125 | may provide additional or different license terms and conditions
126 | for use, reproduction, or distribution of Your modifications, or
127 | for any such Derivative Works as a whole, provided Your use,
128 | reproduction, and distribution of the Work otherwise complies with
129 | the conditions stated in this License.
130 |
131 | 5. Submission of Contributions. Unless You explicitly state otherwise,
132 | any Contribution intentionally submitted for inclusion in the Work
133 | by You to the Licensor shall be under the terms and conditions of
134 | this License, without any additional terms or conditions.
135 | Notwithstanding the above, nothing herein shall supersede or modify
136 | the terms of any separate license agreement you may have executed
137 | with Licensor regarding such Contributions.
138 |
139 | 6. Trademarks. This License does not grant permission to use the trade
140 | names, trademarks, service marks, or product names of the Licensor,
141 | except as required for reasonable and customary use in describing the
142 | origin of the Work and reproducing the content of the NOTICE file.
143 |
144 | 7. Disclaimer of Warranty. Unless required by applicable law or
145 | agreed to in writing, Licensor provides the Work (and each
146 | Contributor provides its Contributions) on an "AS IS" BASIS,
147 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
148 | implied, including, without limitation, any warranties or conditions
149 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
150 | PARTICULAR PURPOSE. You are solely responsible for determining the
151 | appropriateness of using or redistributing the Work and assume any
152 | risks associated with Your exercise of permissions under this License.
153 |
154 | 8. Limitation of Liability. In no event and under no legal theory,
155 | whether in tort (including negligence), contract, or otherwise,
156 | unless required by applicable law (such as deliberate and grossly
157 | negligent acts) or agreed to in writing, shall any Contributor be
158 | liable to You for damages, including any direct, indirect, special,
159 | incidental, or consequential damages of any character arising as a
160 | result of this License or out of the use or inability to use the
161 | Work (including but not limited to damages for loss of goodwill,
162 | work stoppage, computer failure or malfunction, or any and all
163 | other commercial damages or losses), even if such Contributor
164 | has been advised of the possibility of such damages.
165 |
166 | 9. Accepting Warranty or Additional Liability. While redistributing
167 | the Work or Derivative Works thereof, You may choose to offer,
168 | and charge a fee for, acceptance of support, warranty, indemnity,
169 | or other liability obligations and/or rights consistent with this
170 | License. However, in accepting such obligations, You may act only
171 | on Your own behalf and on Your sole responsibility, not on behalf
172 | of any other Contributor, and only if You agree to indemnify,
173 | defend, and hold each Contributor harmless for any liability
174 | incurred by, or claims asserted against, such Contributor by reason
175 | of your accepting any such warranty or additional liability.
176 |
177 | END OF TERMS AND CONDITIONS
178 |
179 | APPENDIX: How to apply the Apache License to your work.
180 |
181 | To apply the Apache License to your work, attach the following
182 | boilerplate notice, with the fields enclosed by brackets "[]"
183 | replaced with your own identifying information. (Don't include
184 | the brackets!) The text should be enclosed in the appropriate
185 | comment syntax for the file format. We also recommend that a
186 | file or class name and description of purpose be included on the
187 | same "printed page" as the copyright notice for easier
188 | identification within third-party archives.
189 |
190 | Copyright [yyyy] [name of copyright owner]
191 |
192 | Licensed under the Apache License, Version 2.0 (the "License");
193 | you may not use this file except in compliance with the License.
194 | You may obtain a copy of the License at
195 |
196 | http://www.apache.org/licenses/LICENSE-2.0
197 |
198 | Unless required by applicable law or agreed to in writing, software
199 | distributed under the License is distributed on an "AS IS" BASIS,
200 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
201 | See the License for the specific language governing permissions and
202 | limitations under the License.
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # spark-hats
  2 | [Build Status](https://github.com/AbsaOSS/spark-hats/actions)
  3 | [FOSSA Status](https://app.fossa.com/projects/git%2Bgithub.com%2FAbsaOSS%2Fspark-hats?ref=badge_shield)
4 |
5 | Spark "**H**elpers for **A**rray **T**ransformation**s**"
6 |
  7 | This library extends the Spark DataFrame API with helpers for transforming fields inside nested structures and arrays of
8 | arbitrary levels of nesting.
9 |
10 | ## Usage
11 |
12 | Reference the library
13 |
14 |
 15 | | Scala 2.11 | Scala 2.12 | Scala 2.13 |
 16 | |:----------:|:----------:|:----------:|
 17 | | groupId: za.co.absa<br>artifactId: spark-hats_2.11<br>version: 0.3.0 | groupId: za.co.absa<br>artifactId: spark-hats_2.12<br>version: 0.3.0 | groupId: za.co.absa<br>artifactId: spark-hats_2.13<br>version: 0.3.0 |
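
For example, to reference the library with sbt (a minimal sketch; the cross-versioned `%%` operator selects the artifact matching your project's Scala version from the table above):

```scala
libraryDependencies += "za.co.absa" %% "spark-hats" % "0.3.0"
```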
39 |
 40 | Please use the table below to determine which version of spark-hats to use for your Spark and Scala versions.
41 |
42 | | spark-hats version | Scala version | Spark version |
43 | |:------------------:|:-------------:|:-------------:|
44 | | 0.1.x | 2.11, 2.12 | 2.4.3+ |
45 | | 0.2.x | 2.11, 2.12 | 2.4.3+ |
46 | | 0.2.x | 2.12 | 3.0.0+ |
47 | | 0.3.x | 2.11 | 2.4.3+ |
48 | | 0.3.x | 2.12, 2.13 | 3.2.1+ |
49 |
50 | To use the extensions you need to add this import to your Spark application or shell:
51 | ```scala
52 | import za.co.absa.spark.hats.Extensions._
53 | ```
54 |
 55 | ### How to generate a code coverage report
56 | ```
57 | sbt ++{matrix.scala} jacoco -DSPARK_VERSION={matrix.spark}
58 | ```
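For example, with concrete versions from the CI matrix (Scala 2.12.18 and Spark 3.2.4; any supported pair from the compatibility table works):

```
sbt ++2.12.18 jacoco -DSPARK_VERSION=3.2.4
```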
 59 | The code coverage report will be generated at:
60 | ```
61 | {project-root}/spark-hats/target/scala-{scala_version}/jacoco/report/html
62 | ```
63 |
64 |
65 | ## Motivation
66 |
 67 | Here is a small example we will use to show how `spark-hats` works. The important thing is that the dataframe
68 | contains an array of struct fields.
69 |
70 | ```scala
71 | scala> df.printSchema()
72 | root
73 | |-- id: long (nullable = true)
74 | |-- my_array: array (nullable = true)
75 | | |-- element: struct (containsNull = true)
76 | | | |-- a: long (nullable = true)
77 | | | |-- b: string (nullable = true)
78 |
79 | scala> df.show(false)
80 | +---+------------------------------+
81 | |id |my_array |
82 | +---+------------------------------+
83 | |1 |[[1, foo]] |
84 | |2 |[[1, bar], [2, baz], [3, foz]]|
85 | +---+------------------------------+
86 | ```
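
If you want to follow along, here is a minimal sketch for constructing this dataframe from JSON literals matching the data above (any equivalent construction works):

```scala
import spark.implicits._

val sample = Seq(
  """{"id":1,"my_array":[{"a":1,"b":"foo"}]}""",
  """{"id":2,"my_array":[{"a":1,"b":"bar"},{"a":2,"b":"baz"},{"a":3,"b":"foz"}]}"""
)
val df = spark.read.json(sample.toDS)
```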
87 |
88 | Now, say, we want to add a field `c` as part of the struct alongside `a` and `b` from the example above. The
89 | expression for `c` is `c = a + 1`.
90 |
91 | Here is the code you can use in Spark:
92 | ```scala
93 | val dfOut = df.select(col("id"), transform(col("my_array"), c => {
94 | struct(c.getField("a").as("a"),
95 | c.getField("b").as("b"),
96 | (c.getField("a") + 1).as("c"))
97 | }).as("my_array"))
98 |
99 | ```
100 | (to use `transform()` in the Scala API on Spark 2.4 you need to add [spark-hofs](https://github.com/AbsaOSS/spark-hofs) as a dependency; Spark 3.x provides it natively).
101 |
102 | Here is how it looks when using `spark-hats` library.
103 | ```scala
104 | val dfOut = df.nestedMapColumn("my_array.a","c", a => a + 1)
105 | ```
106 |
107 | Both produce the following results:
108 | ```scala
109 | scala> dfOut.printSchema
110 | root
111 | |-- id: long (nullable = true)
112 | |-- my_array: array (nullable = true)
113 | | |-- element: struct (containsNull = false)
114 | | | |-- a: long (nullable = true)
115 | | | |-- b: string (nullable = true)
116 | | | |-- c: long (nullable = true)
117 |
118 | scala> dfOut.show(false)
119 | +---+---------------------------------------+
120 | |id |my_array |
121 | +---+---------------------------------------+
122 | |1 |[[1, foo, 2]] |
123 | |2 |[[1, bar, 2], [2, baz, 3], [3, foz, 4]]|
124 | +---+---------------------------------------+
125 | ```
126 |
127 | Imagine what this code would look like with more levels of array nesting; a sketch for two nested arrays follows.
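
A sketch under an assumed schema (illustrative, not the example above) where elements of `my_array` contain a nested array `inner_array` of structs with a field `a`:

```scala
// Plain Spark: one transform() call per level of array nesting,
// rebuilding the struct at every level.
val dfOut = df.select(col("id"), transform(col("my_array"), outer =>
  struct(transform(outer.getField("inner_array"), inner =>
    struct(inner.getField("a").as("a"),
           (inner.getField("a") + 1).as("c"))
  ).as("inner_array"))
).as("my_array"))

// spark-hats: still a one-liner, regardless of nesting depth.
val dfOut2 = df.nestedMapColumn("my_array.inner_array.a", "c", a => a + 1)
```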
128 |
129 | ## Methods
130 |
131 | ### Add a column
132 | The `nestedWithColumn` method allows adding new fields inside nested structures and arrays.
133 |
134 | The column addition API is provided in two flavors: basic and extended. The basic API is simpler to use, but
135 | the expressions it accepts can only reference columns at the root of the schema. Here is an example of the
136 | basic add column API:
137 |
138 | ```scala
139 | scala> df.nestedWithColumn("my_array.c", lit("hello")).printSchema
140 | root
141 | |-- id: long (nullable = true)
142 | |-- my_array: array (nullable = true)
143 | | |-- element: struct (containsNull = false)
144 | | | |-- a: long (nullable = true)
145 | | | |-- b: string (nullable = true)
146 | | | |-- c: string (nullable = false)
147 |
148 | scala> df.nestedWithColumn("my_array.c", lit("hello")).show(false)
149 | +---+---------------------------------------------------+
150 | |id |my_array |
151 | +---+---------------------------------------------------+
152 | |1 |[[1, foo, hello]] |
153 | |2 |[[1, bar, hello], [2, baz, hello], [3, foz, hello]]|
154 | +---+---------------------------------------------------+
155 | ```
156 |
157 | ### Add column (extended)
158 | The extended API method `nestedWithColumnExtended` works similarly to the basic one, but it allows the caller to
159 | reference other array elements, possibly on different levels of nesting. The way it achieves this is a little
160 | tricky: the second parameter is no longer a column but a *function that returns a column*. That function, in turn,
161 | takes a function as its argument: the `getField()` function. The `getField()` function can be used in the
162 | transformation to reference other columns of the dataframe by their fully qualified names.
163 |
164 | In the following example, a transformation adds a new field `my_array.c` to the dataframe by concatenating a root
165 | level column `id` with a nested field `my_array.b`:
166 |
167 | ```scala
168 | scala> val dfOut = df.nestedWithColumnExtended("my_array.c", getField =>
169 | concat(getField("id").cast("string"), getField("my_array.b"))
170 | )
171 |
172 | scala> dfOut.printSchema
173 | root
174 | |-- id: long (nullable = true)
175 | |-- my_array: array (nullable = true)
176 | | |-- element: struct (containsNull = false)
177 | | | |-- a: long (nullable = true)
178 | | | |-- b: string (nullable = true)
179 | | | |-- c: string (nullable = true)
180 |
181 | scala> dfOut.show(false)
182 | +---+------------------------------------------------+
183 | |id |my_array |
184 | +---+------------------------------------------------+
185 | |1 |[[1, foo, 1foo]] |
186 | |2 |[[1, bar, 2bar], [2, baz, 2baz], [3, foz, 2foz]]|
187 | +---+------------------------------------------------+
188 | ```
189 |
190 | * **Note.** You can still use `col` to reference root-level columns. But if a column is inside an array (like
191 | `my_array.b`), invoking `col("my_array.b")` will reference the whole array, not an individual element. The `getField()`
192 | function that is passed to the transformation solves this by providing a generic way of addressing array elements on
193 | arbitrary levels of nesting.
194 |
195 | * **Advanced Note.** If there are several arrays in the schema, `getField()` allows referencing elements of an
196 | array only if that array is one of the parents of the output column, as in the sketch below.
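
For instance, a hypothetical transformation on the example dataframe above: `my_array` is a parent of the output column `my_array.c`, so `getField("my_array.a")` resolves to the element-level value of `a` rather than the whole array:

```scala
val dfOut = df.nestedWithColumnExtended("my_array.c", getField =>
  getField("my_array.a") + getField("id")
)
```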
197 |
198 |
199 | ### Drop a column
200 | The `nestedDropColumn` method allows dropping fields inside nested structures and arrays.
201 |
202 |
203 | ```scala
204 | scala> df.nestedDropColumn("my_array.b").printSchema
205 | root
206 | |-- id: long (nullable = true)
207 | |-- my_array: array (nullable = true)
208 | | |-- element: struct (containsNull = false)
209 | | | |-- a: long (nullable = true)
210 |
211 | scala> df.nestedDropColumn("my_array.b").show(false)
212 | +---+---------------+
213 | |id |my_array |
214 | +---+---------------+
215 | |1 |[[1]] |
216 | |2 |[[1], [2], [3]]|
217 | +---+---------------+
218 | ```
219 |
220 | ### Map a column
221 |
222 | The `nestedMapColumn` method applies a transformation to a nested field. If the input column is a primitive field,
223 | the method adds `outputColumnName` at the same level of nesting. If the input is a struct column, you can use the
224 | `.getField(...)` method to operate on its children.
225 |
226 | The output column name can omit the full path as the field will be created at the same level of nesting as the input column.
227 |
228 | ```scala
229 | scala> df.nestedMapColumn(inputColumnName = "my_array.a", outputColumnName = "c", expression = a => a + 1).printSchema
230 | root
231 | |-- id: long (nullable = true)
232 | |-- my_array: array (nullable = true)
233 | | |-- element: struct (containsNull = false)
234 | | | |-- a: long (nullable = true)
235 | | | |-- b: string (nullable = true)
236 | | | |-- c: long (nullable = true)
237 |
238 | scala> df.nestedMapColumn(inputColumnName = "my_array.a", outputColumnName = "c", expression = a => a + 1).show(false)
239 | +---+---------------------------------------+
240 | |id |my_array |
241 | +---+---------------------------------------+
242 | |1 |[[1, foo, 2]] |
243 | |2 |[[1, bar, 2], [2, baz, 3], [3, foz, 4]]|
244 | +---+---------------------------------------+
245 | ```
246 |
247 | ## Other transformations
248 |
249 | ### Unstruct
250 |
251 | Syntax: `df.nestedUnstruct("NestedStructColumnName")`.
252 |
253 | Flattens one level of nesting when a struct is nested in another struct. For example,
254 |
255 | ```scala
256 | scala> df.printSchema
257 | root
258 | |-- id: long (nullable = true)
259 | |-- my_array: array (nullable = true)
260 | | |-- element: struct (containsNull = true)
261 | | | |-- a: long (nullable = true)
262 | | | |-- b: string (nullable = true)
263 |  |    |    |-- c: struct (nullable = true)
264 |  |    |    |    |-- nestedField1: string (nullable = true)
265 |  |    |    |    |-- nestedField2: long (nullable = true)
266 |
267 | scala> df.nestedUnstruct("my_array.c").printSchema
268 | root
269 | |-- id: long (nullable = true)
270 | |-- my_array: array (nullable = true)
271 | | |-- element: struct (containsNull = true)
272 | | | |-- a: long (nullable = true)
273 | | | |-- b: string (nullable = true)
274 | | | |-- nestedField1: string (nullable = true)
275 | | | |-- nestedField2: long (nullable = true)
276 | ```
277 |
278 | Note that the output schema doesn't have the `c` struct. All fields of `c` are now part of the parent struct.
279 |
280 | ## Changelog
281 | - #### 0.3.0 released 3 August 2023.
282 | - [#38](https://github.com/AbsaOSS/spark-hats/issues/38) Add scala 2.13 support.
283 | - [#33](https://github.com/AbsaOSS/spark-hats/issues/33) Update spark test to 3.2.1.
284 | - [#35](https://github.com/AbsaOSS/spark-hats/issues/35) Add code coverage support.
285 |
286 | - #### 0.2.2 released 8 March 2021.
287 | - [#23](https://github.com/AbsaOSS/spark-hats/issues/23) Added `nestedUnstruct()` method that flattens one level of nesting for a given struct.
288 |
289 | - #### 0.2.1 released 21 January 2020.
290 | - [#10](https://github.com/AbsaOSS/spark-hats/issues/10) Fixed error column aggregation when the input array is `null`.
291 |
292 | - #### 0.2.0 released 16 January 2020.
293 | - [#5](https://github.com/AbsaOSS/spark-hats/issues/5) Added the extended nested transformation API that allows referencing arbitrary columns.
294 |
295 |
296 | ## License
297 | [FOSSA Status](https://app.fossa.com/projects/git%2Bgithub.com%2FAbsaOSS%2Fspark-hats?ref=badge_large)
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import Dependencies._
18 |
19 | val scala211 = "2.11.12"
20 | val scala212 = "2.12.18"
21 | val scala213 = "2.13.11"
22 |
23 | ThisBuild / organization := "za.co.absa"
24 |
25 | ThisBuild / scalaVersion := scala212
26 | ThisBuild / crossScalaVersions := Seq(scala211, scala212, scala213)
27 |
28 | ThisBuild / scalacOptions := Seq("-unchecked", "-deprecation")
29 |
30 | // Scala shouldn't be packaged so it is explicitly added as a provided dependency below
31 | ThisBuild / autoScalaLibrary := false
32 |
 33 | lazy val printSparkVersion = taskKey[Unit]("Print the Spark version spark-hats is building against.")
34 |
35 | lazy val hats = (project in file("."))
36 | .settings(
37 | name := "spark-hats",
38 | printSparkVersion := {
39 | val log = streams.value.log
40 | val effectiveSparkVersion = sparkVersion(scalaVersion.value)
41 | log.info(s"Building with Spark $effectiveSparkVersion")
42 | effectiveSparkVersion
43 | },
44 | Compile / compile := ((Compile / compile) dependsOn printSparkVersion).value,
45 | Compile / unmanagedSourceDirectories += {
46 | val sourceDir = (Compile / sourceDirectory).value
47 | CrossVersion.partialVersion(scalaVersion.value) match {
48 | case Some((2, n)) if n == 11 => sourceDir / "scala_2.11"
49 | case Some((2, n)) if n == 12 => sourceDir / "scala_2.12"
50 | case Some((2, n)) if n == 13 => sourceDir / "scala_2.13"
51 | case _ => throw new RuntimeException("Unsupported Scala version")
52 | }
53 | },
54 | libraryDependencies ++= getSparkHatsDependencies(scalaVersion.value) ++ getHofsDependency(scalaVersion.value) :+ getScalaDependency(scalaVersion.value),
55 | releasePublishArtifactsAction := PgpKeys.publishSigned.value,
56 | Test / fork := true
57 | ).enablePlugins(AutomateHeaderPlugin)
58 |
59 | // release settings
60 | releaseCrossBuild := true
61 | addCommandAlias("releaseNow", ";set releaseVersionBump := sbtrelease.Version.Bump.Bugfix; release with-defaults")
62 |
63 | // JaCoCo code coverage
64 | Test / jacocoReportSettings := JacocoReportSettings(
65 | title = s"spark-hats Jacoco Report - ${scalaVersion.value}",
66 | formats = Seq(JacocoReportFormats.HTML, JacocoReportFormats.XML)
67 | )
68 |
69 | // exclude example
70 | Test / jacocoExcludes := Seq(
71 | // "za.co.absa.spark.hats.transformations.NestedArrayTransformation*", // class and related objects
72 | // "za.co.absa.spark.hats.transformations.ArrayContext" // class only
73 | )
74 |
--------------------------------------------------------------------------------
/project/Dependencies.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | import sbt._
18 |
19 | object Dependencies {
20 |
21 | val defaultSparkVersionForScala211 = "2.4.8"
22 | val defaultSparkVersionForScala212 = "3.3.2"
23 | val defaultSparkVersionForScala213 = "3.4.1"
24 |
25 | private val sparkHofsVersion = "0.4.0"
26 | private val scalatestVersion = "3.2.14"
27 |
28 | def getScalaDependency(scalaVersion: String): ModuleID = "org.scala-lang" % "scala-library" % scalaVersion % Provided
29 |
30 | def getSparkHatsDependencies(scalaVersion: String): Seq[ModuleID] = Seq(
31 | // provided
32 | "org.apache.spark" %% "spark-core" % sparkVersion(scalaVersion) % Provided,
33 | "org.apache.spark" %% "spark-sql" % sparkVersion(scalaVersion) % Provided,
34 | "org.apache.spark" %% "spark-catalyst" % sparkVersion(scalaVersion) % Provided,
35 |
36 | // test
37 | "org.scalatest" %% "scalatest" % scalatestVersion % Test
38 | )
39 |
40 | def getHofsDependency(scalaVersion: String): Seq[ModuleID] = if (scalaVersion.startsWith("2.11.")) {
41 | Seq("za.co.absa" %% "spark-hofs" % sparkHofsVersion)
42 | } else {
43 | Seq.empty
44 | }
45 |
46 | def sparkVersion(scalaVersion: String): String = sys.props.getOrElse("SPARK_VERSION", sparkFallbackVersion(scalaVersion))
47 |
48 | def sparkFallbackVersion(scalaVersion: String): String = {
49 | if (scalaVersion.startsWith("2.11.")) {
50 | defaultSparkVersionForScala211
51 | } else if (scalaVersion.startsWith("2.12.")) {
52 | defaultSparkVersionForScala212
53 | } else if (scalaVersion.startsWith("2.13.")) {
54 | defaultSparkVersionForScala213
55 | } else {
56 | throw new IllegalArgumentException(s"Scala $scalaVersion not supported.")
57 | }
58 | }
59 | }
60 |
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | #
2 | # Copyright 2019 ABSA Group Limited
3 | #
4 | # Licensed under the Apache License, Version 2.0 (the "License");
5 | # you may not use this file except in compliance with the License.
6 | # You may obtain a copy of the License at
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | #
15 |
16 | sbt.version=1.9.2
17 |
--------------------------------------------------------------------------------
/project/plugins.sbt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2019 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.2.1")
18 | addSbtPlugin("com.github.sbt" % "sbt-release" % "1.1.0")
19 | addSbtPlugin("de.heikoseeberger" % "sbt-header" % "5.7.0")
20 |
21 | // sbt-jacoco - workaround related dependencies required to download
22 | lazy val ow2Version = "9.5"
23 | lazy val jacocoVersion = "0.8.10-absa.1"
24 |
25 | def jacocoUrl(artifactName: String): String = s"https://github.com/AbsaOSS/jacoco/releases/download/$jacocoVersion/org.jacoco.$artifactName-$jacocoVersion.jar"
26 | def ow2Url(artifactName: String): String = s"https://repo1.maven.org/maven2/org/ow2/asm/$artifactName/$ow2Version/$artifactName-$ow2Version.jar"
27 |
28 | addSbtPlugin("com.jsuereth" %% "scala-arm" % "2.0" from "https://repo1.maven.org/maven2/com/jsuereth/scala-arm_2.11/2.0/scala-arm_2.11-2.0.jar")
29 | addSbtPlugin("com.jsuereth" %% "scala-arm" % "2.0" from "https://repo1.maven.org/maven2/com/jsuereth/scala-arm_2.12/2.0/scala-arm_2.12-2.0.jar")
30 |
31 | addSbtPlugin("za.co.absa.jacoco" % "report" % jacocoVersion from jacocoUrl("report"))
32 | addSbtPlugin("za.co.absa.jacoco" % "core" % jacocoVersion from jacocoUrl("core"))
33 | addSbtPlugin("za.co.absa.jacoco" % "agent" % jacocoVersion from jacocoUrl("agent"))
34 | addSbtPlugin("org.ow2.asm" % "asm" % ow2Version from ow2Url("asm"))
35 | addSbtPlugin("org.ow2.asm" % "asm-commons" % ow2Version from ow2Url("asm-commons"))
36 | addSbtPlugin("org.ow2.asm" % "asm-tree" % ow2Version from ow2Url("asm-tree"))
37 |
38 | addSbtPlugin("za.co.absa.sbt" % "sbt-jacoco" % "3.4.1-absa.3" from "https://github.com/AbsaOSS/sbt-jacoco/releases/download/3.4.1-absa.3/sbt-jacoco-3.4.1-absa.3.jar")
39 |
--------------------------------------------------------------------------------
/publish.sbt:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | ThisBuild / organizationName := "ABSA Group Limited"
18 | ThisBuild / organizationHomepage := Some(url("https://www.absa.africa"))
19 | ThisBuild / scmInfo := Some(
20 | ScmInfo(
21 | browseUrl = url("https://github.com/AbsaOSS/spark-hats/tree/master"),
22 | connection = "scm:git:ssh://github.com/AbsaOSS/spark-hats.git",
23 | devConnection = "scm:git:ssh://github.com/AbsaOSS/spark-hats.git"
24 | )
25 | )
26 |
27 | ThisBuild / developers := List(
28 | Developer(
29 | id = "yruslan",
30 | name = "Ruslan Iushchenko",
31 | email = "ruslan.iushchenko@absa.africa",
32 | url = url("https://github.com/yruslan")
33 | )
34 | )
35 |
36 | ThisBuild / homepage := Some(url("https://github.com/AbsaOSS/spark-hats"))
37 | ThisBuild / description := "Spark extensions for working with nested arrays and structs"
38 | ThisBuild / startYear := Some(2020)
39 | ThisBuild / licenses += "Apache-2.0" -> url("https://www.apache.org/licenses/LICENSE-2.0.txt")
40 |
41 | ThisBuild / pomIncludeRepository := { _ => false }
42 | ThisBuild / publishTo := {
43 | val nexus = "https://oss.sonatype.org/"
44 | if (isSnapshot.value) {
45 | Some("snapshots" at s"${nexus}content/repositories/snapshots")
46 | } else {
47 | Some("releases" at s"${nexus}service/local/staging/deploy/maven2")
48 | }
49 | }
50 | ThisBuild / publishMavenStyle := true
51 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/spark/hats/Extensions.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats
18 |
19 | import org.apache.spark.sql.types.StructType
20 | import org.apache.spark.sql.{Column, DataFrame, Dataset, Row}
21 | import za.co.absa.spark.hats.transformations.NestedArrayTransformations
22 |
23 | /**
24 | * The object is a container of extension methods for Spark DataFrames.
25 | */
26 | object Extensions {
27 |
28 | type TransformFunction = Column => Column
29 |
30 | type ExtendedTransformFunction = (String => Column) => Column
31 |
32 | /**
33 | * The class represents an extension wrapper for an [[org.apache.spark.sql.DataFrame]].
34 | *
35 | * @param dataset A data frame to be extended with methods contained in this class.
36 | */
37 | implicit class DataFrameExtension(dataset: Dataset[Row]) {
38 |
39 | /**
 40 |      * Add a column that can be inside nested structs, arrays and their combinations.
41 | *
42 | * @param newColumnName A column name to be created
43 | * @param expression A new column value
44 | * @return A dataframe with a new field that contains transformed values.
45 | */
46 | def nestedWithColumn(newColumnName: String,
47 | expression: Column): Dataset[Row] = {
48 | NestedArrayTransformations.nestedAddColumn(dataset, newColumnName, expression)
49 | }
50 |
51 | /**
 52 |      * Add a column that can be inside nested structs, arrays and their combinations. Unlike `nestedWithColumn`,
 53 |      * the expression is a function that receives a field resolver (`getField`) for referencing other columns
 54 |      * of the dataframe by their fully qualified names.
 55 |      * @param newColumnName A column name to be created
 56 |      * @param expression A function that, given the field resolver `getField`, returns the new column value
57 | */
58 | def nestedWithColumnExtended(newColumnName: String,
59 | expression: ExtendedTransformFunction): Dataset[Row] = {
60 | NestedArrayTransformations.nestedAddColumnExtended(dataset, newColumnName,
61 | (_, getFieldFunction) => expression(getFieldFunction))
62 | }
63 |
64 | /**
 65 |      * Drop a column from inside nested structs, arrays and their combinations.
 66 |      *
 67 |      * @param columnToDrop A column name to be dropped
 68 |      * @return A dataframe with the specified column removed.
69 | */
70 | def nestedDropColumn(columnToDrop: String): DataFrame = {
71 | NestedArrayTransformations.nestedDropColumn(dataset, columnToDrop)
72 | }
73 |
74 | /**
 75 |      * Map transformation for columns that can be inside nested structs, arrays and their combinations.
76 | *
77 | * If the input column is a primitive field the method will add outputColumnName at the same level of nesting
78 | * by executing the `expression` passing the source column into it. If a struct column is expected you can
79 | * use `.getField(...)` method to operate on its children.
80 | *
81 | * The output column name can omit the full path as the field will be created at the same level of nesting as the input column.
82 | *
83 | * @param inputColumnName A column name for which to apply the transformation, e.g. `company.employee.firstName`.
84 | * @param outputColumnName The output column name. The path is optional, e.g. you can use `conformedName` instead of `company.employee.conformedName`.
85 | * @param expression A function that applies a transformation to a column as a Spark expression.
86 | * @return A dataframe with a new field that contains transformed values.
87 | */
88 | def nestedMapColumn(inputColumnName: String,
89 | outputColumnName: String,
90 | expression: TransformFunction): DataFrame = {
91 | NestedArrayTransformations.nestedWithColumnMap(dataset, inputColumnName, outputColumnName, expression)
92 | }
93 |
94 | /**
 95 |      * Moves all fields of the specified struct up one level. This can only be invoked on a struct nested in another struct.
96 | *
97 | * {{{
98 | * root
99 | * |-- a: struct
100 | * | |-- b: struct
101 | * | | |-- c: string
102 | * | | |-- d: string
103 | *
104 | * df.nestedUnstruct("a.b")
105 | *
106 | * root
107 | * |-- a: struct
108 | * | |-- c: string
109 | * | |-- d: string
110 | * }}}
111 | *
112 | *
113 | * @param inputColumnName A struct column name that contains the fields to extract.
114 |      * @return A dataframe with the struct removed and its fields moved up one level.
115 | */
116 | def nestedUnstruct(inputColumnName: String): DataFrame = {
117 | NestedArrayTransformations.nestedUnstruct(dataset, inputColumnName)
118 | }
119 |
120 | }
121 |
122 | }
123 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/spark/hats/transformations/ArrayContext.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.transformations
18 |
19 | import org.apache.spark.sql.Column
20 | import org.apache.spark.sql.functions.col
21 | import za.co.absa.spark.hats.transformations.NestedArrayTransformations.splitByDeepestParent
22 |
23 | /**
 24 |   * The class provides storage for the array transformation context of a dataframe field transformation.
 25 |   * The context contains all arrays in the path of the field and their corresponding array element lambda variables
 26 |   * provided by the 'transform()' function of Spark SQL.
27 | */
28 | private[transformations]
29 | class ArrayContext(val arrayPaths: Seq[String] = Array[String](),
30 | val lambdaVars: Seq[Column] = Array[Column]()) {
31 |
32 | /**
33 | * Returns a new context by appending the current context with a new array/lambda combination.
34 | *
35 | * @param arrayPath A fully-qualified array field name.
36 | * @param lambdaVar A lambda variable of the array element provided by 'transform()' function of Spark SQL.
 37 |     * @return A new context with the given array path and lambda variable appended.
38 | */
39 | def withArraysUpdated(arrayPath: String, lambdaVar: Column): ArrayContext = {
40 | new ArrayContext(arrayPaths :+ arrayPath, lambdaVars :+ lambdaVar)
41 | }
42 |
43 | /**
44 | * Returns an instance of Column that corresponds to the input field's level of array nesting.
45 | *
46 | * @param fieldName A fully-qualified field name.
47 | * @return A column that corresponds to the field name.
48 | */
49 | def getField(fieldName: String): Column = {
50 | val (parentArray, childField) = splitByDeepestParent(fieldName, arrayPaths)
51 | if (parentArray.isEmpty) {
52 | col(childField)
53 | } else {
54 | val i = arrayPaths.indexOf(parentArray)
55 | if (fieldName == arrayPaths(i)) {
56 | // If the array itself is specified - return the array
57 | lambdaVars(i)
58 | } else {
59 | // If a field inside an array is specified - return the field
60 | // by using '.getField()' on each child (which could be a nested struct)
61 | childField.split('.')
62 | .foldLeft(lambdaVars(i))((parent, column) => parent.getField(column))
63 | }
64 | }
65 | }
66 | }
67 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/spark/hats/utils/JsonUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.utils
18 |
19 | import com.fasterxml.jackson.databind.ObjectMapper
20 | import org.apache.spark.sql.{DataFrame, SparkSession}
21 |
22 | object JsonUtils {
23 |
24 | /**
25 | * Formats a JSON string so it looks pretty.
26 | *
27 | * @param jsonIn A JSON string
28 | * @return A pretty formatted JSON string
29 | */
30 | def prettyJSON(jsonIn: String): String = {
31 | val mapper = new ObjectMapper()
32 |
33 | val jsonUnindented = mapper.readValue(jsonIn, classOf[Any])
34 | val indented = mapper.writerWithDefaultPrettyPrinter.writeValueAsString(jsonUnindented)
35 | indented.replace("\r\n", "\n")
36 | }
37 |
38 | /**
 39 |     * Formats Spark-generated JSON strings such as those returned by
40 | * applying `.toJSON.collect()` to a DataFrame.
41 | *
42 | * @param jsons A list of JSON documents
43 | * @return A pretty formatted JSON string
44 | */
45 | def prettySparkJSON(jsons: Seq[String]): String = {
47 | val singleJSON = jsons.mkString("[", ",", "]")
48 | prettyJSON(singleJSON)
49 | }
50 |
51 | /**
 52 |     * Creates a Spark DataFrame from one or more JSON documents.
 53 |     * @param spark A Spark session used for parsing
 54 |     * @param json A sequence of JSON strings, one document per element
55 | * @return A data frame
56 | */
57 | def getDataFrameFromJson(spark: SparkSession, json: Seq[String]): DataFrame = {
58 | import spark.implicits._
59 | spark.read.json(json.toDS)
60 | }
61 | }
62 |
--------------------------------------------------------------------------------
/src/main/scala/za/co/absa/spark/hats/utils/SchemaUtils.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.utils
18 |
19 | import org.apache.spark.sql.types._
20 |
21 | import scala.annotation.tailrec
22 | import scala.util.Random
23 |
24 | object SchemaUtils {
25 |
26 | /**
27 | * For an array of arrays of arrays, ... get the final element type at the bottom of the array
28 | *
29 | * @param arrayType An array data type from a Spark dataframe schema
30 | * @return A non-array data type at the bottom of array nesting
31 | */
32 | @tailrec
33 | def getDeepestArrayType(arrayType: ArrayType): DataType = {
34 | arrayType.elementType match {
35 | case a: ArrayType => getDeepestArrayType(a)
36 | case b => b
37 | }
38 | }
39 |
40 | /**
41 | * Generate a unique column name
42 | *
43 | * @param prefix A prefix to use for the column name
44 | * @param schema An optional schema to validate if the column already exists (a very low probability)
45 | * @return A name that can be used as a unique column name
46 | */
47 | def getUniqueName(prefix: String, schema: Option[StructType]): String = {
48 | schema match {
49 | case None =>
50 | s"${prefix}_${Random.nextLong().abs}"
51 | case Some(sch) =>
52 | var exists = true
53 | var columnName = ""
54 | while (exists) {
55 | columnName = s"${prefix}_${Random.nextLong().abs}"
56 | exists = sch.fields.exists(_.name.compareToIgnoreCase(columnName) == 0)
57 | }
58 | columnName
59 | }
60 | }
61 |
62 | }
63 |
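64 | // A minimal usage sketch (hypothetical values; `df` stands for any DataFrame), assuming the imports above:
65 | //
66 | //   SchemaUtils.getDeepestArrayType(ArrayType(ArrayType(StringType)))  // StringType
67 | //   SchemaUtils.getUniqueName("errCol", Some(df.schema))               // e.g. "errCol_1234567890"
68 |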
--------------------------------------------------------------------------------
/src/main/scala_2.11/za/co/absa/spark/hats/HofsWrapper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats
18 |
19 | import org.apache.spark.sql.Column
20 |
21 | import za.co.absa.spark.hofs.{transform => hofsTransform}
22 |
23 | /**
24 |   * This is a wrapper for higher-order functions, selected according to the Scala version.
25 |   *
26 |   * This implementation uses the spark-hofs library (https://github.com/AbsaOSS/spark-hofs).
27 | */
28 | object HofsWrapper {
29 | /**
30 | * Applies the function `f` to every element in the `array`. The method is an equivalent to the `map` function
31 | * from functional programming.
32 | *
33 | * @param array A column of arrays
34 | * @param f A function transforming individual elements of the array
35 | * @param elementName The name of the lambda variable. The value is used in Spark execution plans.
36 | * @return A column of arrays with transformed elements
37 | */
38 | def transform(
39 | array: Column,
40 | f: Column => Column,
41 | elementName: String): Column = {
42 | hofsTransform(array, f, elementName)
43 | }
44 | }
45 |
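46 | // A minimal usage sketch (hypothetical DataFrame `df` with an array column `nums`); the
47 | // Scala 2.12/2.13 variants expose the same signature, so call sites stay source-compatible:
48 | //
49 | //   import org.apache.spark.sql.functions.col
50 | //   df.withColumn("doubled", HofsWrapper.transform(col("nums"), x => x * 2, "elem"))
51 |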
--------------------------------------------------------------------------------
/src/main/scala_2.12/za/co/absa/spark/hats/HofsWrapper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats
18 |
19 | import org.apache.spark.sql.Column
20 | import org.apache.spark.sql.functions.{transform => sparkTransform}
21 |
22 | /**
23 |   * This is a wrapper for higher-order functions, selected according to the Scala version.
24 |   *
25 |   * This implementation uses the native Spark transform() function.
26 | */
27 | object HofsWrapper {
28 | /**
29 | * Applies the function `f` to every element in the `array`. The method is an equivalent to the `map` function
30 | * from functional programming.
31 | *
32 | * @param array A column of arrays
33 | * @param f A function transforming individual elements of the array
34 |     * @param elementName The name of the lambda variable. Ignored by this implementation; native Spark transform() names the variable itself.
35 | * @return A column of arrays with transformed elements
36 | */
37 | def transform(
38 | array: Column,
39 | f: Column => Column,
40 | elementName: String): Column = {
41 | sparkTransform(array, f)
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/main/scala_2.13/za/co/absa/spark/hats/HofsWrapper.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats
18 |
19 | import org.apache.spark.sql.Column
20 | import org.apache.spark.sql.functions.{transform => sparkTransform}
21 |
22 | /**
23 |   * This is a wrapper for higher-order functions, selected according to the Scala version.
24 |   *
25 |   * This implementation uses the native Spark transform() function.
26 | */
27 | object HofsWrapper {
28 | /**
29 | * Applies the function `f` to every element in the `array`. The method is an equivalent to the `map` function
30 | * from functional programming.
31 | *
32 | * @param array A column of arrays
33 | * @param f A function transforming individual elements of the array
34 |     * @param elementName The name of the lambda variable. Ignored by this implementation; native Spark transform() names the variable itself.
35 | * @return A column of arrays with transformed elements
36 | */
37 | def transform(
38 | array: Column,
39 | f: Column => Column,
40 | elementName: String): Column = {
41 | sparkTransform(array, f)
42 | }
43 | }
44 |
--------------------------------------------------------------------------------
/src/test/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Copyright 2020 ABSA Group Limited
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | # http://www.apache.org/licenses/LICENSE-2.0
7 | #
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 |
14 | log4j.rootCategory=INFO, console
15 | log4j.appender.console=org.apache.log4j.ConsoleAppender
16 | log4j.appender.console.target=System.err
17 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
18 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
19 | log4j.appender.console.Threshold=ERROR
20 |
--------------------------------------------------------------------------------
/src/test/resources/log4j2.properties:
--------------------------------------------------------------------------------
1 | # Copyright 2020 ABSA Group Limited
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | # http://www.apache.org/licenses/LICENSE-2.0
7 | #
8 | # Unless required by applicable law or agreed to in writing, software
9 | # distributed under the License is distributed on an "AS IS" BASIS,
10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | # See the License for the specific language governing permissions and
12 | # limitations under the License.
13 |
14 | # Log4j 2 uses its own properties syntax; the Log4j 1.x keys do not apply here.
15 | rootLogger.level = INFO
16 | rootLogger.appenderRef.console.ref = console
17 |
18 | appender.console.type = Console
19 | appender.console.name = console
20 | appender.console.target = SYSTEM_ERR
21 | appender.console.layout.type = PatternLayout
22 | appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n
23 | appender.console.filter.threshold.type = ThresholdFilter
24 | appender.console.filter.threshold.level = ERROR
25 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested10Results.json:
--------------------------------------------------------------------------------
1 | [ {
2 | "struct3" : {
3 | "inner3" : {
4 | "array3" : [ {
5 | "a1" : 3,
6 | "a2" : 1,
7 | "a3" : "1",
8 | "out" : "3 1"
9 | }, {
10 | "a1" : 4,
11 | "a2" : 2,
12 | "a3" : "5",
13 | "out" : "4 2"
14 | } ]
15 | }
16 | },
17 | "errCol" : [ {
18 | "errType" : "Initial",
19 | "errCode" : "000",
20 | "errMsg" : "ErrMsg",
21 | "errCol" : "id",
22 | "rawValues" : [ ],
23 | "mappings" : [ ]
24 | }, {
25 | "errType" : "confCastError",
26 | "errCode" : "E00003",
27 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
28 | "errCol" : "a1!==3",
29 | "rawValues" : [ "4" ],
30 | "mappings" : [ ]
31 | } ]
32 | }, {
33 | "struct3" : {
34 | "inner3" : {
35 | "array3" : [ {
36 | "a1" : 4,
37 | "a2" : 2,
38 | "a3" : "3",
39 | "out" : "4 2"
40 | }, {
41 | "a1" : 8,
42 | "a2" : 2,
43 | "a3" : "5",
44 | "out" : "8 2"
45 | } ]
46 | }
47 | },
48 | "errCol" : [ {
49 | "errType" : "Initial",
50 | "errCode" : "000",
51 | "errMsg" : "ErrMsg",
52 | "errCol" : "id",
53 | "rawValues" : [ ],
54 | "mappings" : [ ]
55 | }, {
56 | "errType" : "confCastError",
57 | "errCode" : "E00003",
58 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
59 | "errCol" : "a1!==3",
60 | "rawValues" : [ "4" ],
61 | "mappings" : [ ]
62 | }, {
63 | "errType" : "confCastError",
64 | "errCode" : "E00003",
65 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
66 | "errCol" : "a1!==3",
67 | "rawValues" : [ "8" ],
68 | "mappings" : [ ]
69 | } ]
70 | }, {
71 | "struct3" : {
72 | "inner3" : {
73 | "array3" : [ {
74 | "a1" : 5,
75 | "a2" : 3,
76 | "a3" : "4",
77 | "out" : "5 3"
78 | }, {
79 | "a1" : 8,
80 | "a2" : 4,
81 | "a3" : "7",
82 | "out" : "8 4"
83 | } ]
84 | }
85 | },
86 | "errCol" : [ {
87 | "errType" : "Initial",
88 | "errCode" : "000",
89 | "errMsg" : "ErrMsg",
90 | "errCol" : "id",
91 | "rawValues" : [ ],
92 | "mappings" : [ ]
93 | }, {
94 | "errType" : "confCastError",
95 | "errCode" : "E00003",
96 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
97 | "errCol" : "a1!==3",
98 | "rawValues" : [ "5" ],
99 | "mappings" : [ ]
100 | }, {
101 | "errType" : "confCastError",
102 | "errCode" : "E00003",
103 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
104 | "errCol" : "a1!==3",
105 | "rawValues" : [ "8" ],
106 | "mappings" : [ ]
107 | } ]
108 | }, {
109 | "struct3" : {
110 | "inner3" : {
111 | "array3" : [ {
112 | "a1" : 6,
113 | "a2" : 4,
114 | "a3" : "6",
115 | "out" : "6 4"
116 | }, {
117 | "a1" : 9,
118 | "a2" : 3,
119 | "a3" : "7",
120 | "out" : "9 3"
121 | } ]
122 | }
123 | },
124 | "errCol" : [ {
125 | "errType" : "Initial",
126 | "errCode" : "000",
127 | "errMsg" : "ErrMsg",
128 | "errCol" : "id",
129 | "rawValues" : [ ],
130 | "mappings" : [ ]
131 | }, {
132 | "errType" : "confCastError",
133 | "errCode" : "E00003",
134 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
135 | "errCol" : "a1!==3",
136 | "rawValues" : [ "6" ],
137 | "mappings" : [ ]
138 | }, {
139 | "errType" : "confCastError",
140 | "errCode" : "E00003",
141 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
142 | "errCol" : "a1!==3",
143 | "rawValues" : [ "9" ],
144 | "mappings" : [ ]
145 | } ]
146 | }, {
147 | "struct3" : {
148 | "inner3" : {
149 | "array3" : [ {
150 | "a1" : 7,
151 | "a2" : 5,
152 | "a3" : "7",
153 | "out" : "7 5"
154 | } ]
155 | }
156 | },
157 | "errCol" : [ {
158 | "errType" : "Initial",
159 | "errCode" : "000",
160 | "errMsg" : "ErrMsg",
161 | "errCol" : "id",
162 | "rawValues" : [ ],
163 | "mappings" : [ ]
164 | }, {
165 | "errType" : "confCastError",
166 | "errCode" : "E00003",
167 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
168 | "errCol" : "a1!==3",
169 | "rawValues" : [ "7" ],
170 | "mappings" : [ ]
171 | } ]
172 | }, {
173 | "struct3" : {
174 | "inner3" : {
175 | "array3" : [ {
176 | "a1" : 4,
177 | "a2" : 6,
178 | "a3" : "5",
179 | "out" : "4 6"
180 | } ]
181 | }
182 | },
183 | "errCol" : [ {
184 | "errType" : "Initial",
185 | "errCode" : "000",
186 | "errMsg" : "ErrMsg",
187 | "errCol" : "id",
188 | "rawValues" : [ ],
189 | "mappings" : [ ]
190 | }, {
191 | "errType" : "confCastError",
192 | "errCode" : "E00003",
193 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
194 | "errCol" : "a1!==3",
195 | "rawValues" : [ "4" ],
196 | "mappings" : [ ]
197 | } ]
198 | }, {
199 | "struct3" : {
200 | "inner3" : { }
201 | },
202 | "errCol" : [ {
203 | "errType" : "Initial",
204 | "errCode" : "000",
205 | "errMsg" : "ErrMsg",
206 | "errCol" : "id",
207 | "rawValues" : [ ],
208 | "mappings" : [ ]
209 | } ]
210 | }, {
211 | "struct3" : {
212 | "inner3" : { }
213 | },
214 | "errCol" : [ {
215 | "errType" : "Initial",
216 | "errCode" : "000",
217 | "errMsg" : "ErrMsg",
218 | "errCol" : "id",
219 | "rawValues" : [ ],
220 | "mappings" : [ ]
221 | } ]
222 | } ]
223 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested10Schema.txt:
--------------------------------------------------------------------------------
1 | root
2 | |-- struct3: struct (nullable = false)
3 | | |-- inner3: struct (nullable = false)
4 | | | |-- array3: array (nullable = true)
5 | | | | |-- element: struct (containsNull = false)
6 | | | | | |-- a1: long (nullable = true)
7 | | | | | |-- a2: long (nullable = true)
8 | | | | | |-- a3: string (nullable = true)
9 | | | | | |-- out: string (nullable = true)
10 | |-- errCol: array (nullable = true)
11 | | |-- element: struct (containsNull = true)
12 | | | |-- errType: string (nullable = true)
13 | | | |-- errCode: string (nullable = true)
14 | | | |-- errMsg: string (nullable = true)
15 | | | |-- errCol: string (nullable = true)
16 | | | |-- rawValues: array (nullable = true)
17 | | | | |-- element: string (containsNull = true)
18 | | | |-- mappings: array (nullable = true)
19 | | | | |-- element: struct (containsNull = true)
20 | | | | | |-- mappingTableColumn: string (nullable = true)
21 | | | | | |-- mappedDatasetColumn: string (nullable = true)
22 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested1Results.json:
--------------------------------------------------------------------------------
1 | [ {
2 | "id" : 1,
3 | "key1" : 1,
4 | "key2" : 2,
5 | "struct1" : {
6 | "key3" : 3,
7 | "key4" : 1
8 | },
9 | "struct2" : {
10 | "inner1" : {
11 | "key5" : 3,
12 | "key6" : 1,
13 | "skey1" : "1"
14 | }
15 | },
16 | "struct3" : {
17 | "inner3" : {
18 | "array3" : [ {
19 | "a1" : 3,
20 | "a2" : 1,
21 | "a3" : "1"
22 | }, {
23 | "a1" : 4,
24 | "a2" : 2,
25 | "a3" : "5"
26 | } ]
27 | }
28 | },
29 | "array1" : [ {
30 | "key7" : 2,
31 | "key8" : 3,
32 | "skey2" : "1"
33 | }, {
34 | "key7" : 1,
35 | "key8" : 2,
36 | "skey2" : "2"
37 | }, {
38 | "key7" : 3,
39 | "key8" : 3,
40 | "skey2" : "3"
41 | } ],
42 | "array2" : [ {
43 | "key2" : 1,
44 | "inner2" : [ {
45 | "key9" : 1,
46 | "key10" : 2,
47 | "struct3" : {
48 | "k1" : 1,
49 | "k2" : 2
50 | }
51 | }, {
52 | "key9" : 1,
53 | "key10" : 2,
54 | "struct3" : {
55 | "k1" : 2,
56 | "k2" : 2
57 | }
58 | } ]
59 | }, {
60 | "key2" : 2,
61 | "inner2" : [ {
62 | "key9" : 3,
63 | "key10" : 1,
64 | "struct3" : {
65 | "k1" : 1,
66 | "k2" : 2
67 | }
68 | }, {
69 | "key9" : 2,
70 | "key10" : 2,
71 | "struct3" : {
72 | "k1" : 3,
73 | "k2" : 3
74 | }
75 | } ]
76 | } ],
77 | "id_str" : "1 1 2"
78 | }, {
79 | "id" : 2,
80 | "key1" : 2,
81 | "key2" : 1,
82 | "struct1" : {
83 | "key3" : 2,
84 | "key4" : 3
85 | },
86 | "struct2" : {
87 | "inner1" : {
88 | "key5" : 2,
89 | "key6" : 3,
90 | "skey1" : "2"
91 | }
92 | },
93 | "struct3" : {
94 | "inner3" : {
95 | "array3" : [ {
96 | "a1" : 4,
97 | "a2" : 2,
98 | "a3" : "3"
99 | }, {
100 | "a1" : 8,
101 | "a2" : 2,
102 | "a3" : "5"
103 | } ]
104 | }
105 | },
106 | "array1" : [ {
107 | "key7" : 4,
108 | "key8" : 2,
109 | "skey2" : "2"
110 | }, {
111 | "key7" : 3,
112 | "key8" : 1,
113 | "skey2" : "3"
114 | }, {
115 | "key7" : 3,
116 | "key8" : 3,
117 | "skey2" : "3"
118 | } ],
119 | "array2" : [ {
120 | "key2" : 2,
121 | "inner2" : [ {
122 | "key9" : 1,
123 | "key10" : 2,
124 | "struct3" : {
125 | "k1" : 1,
126 | "k2" : 1
127 | }
128 | }, {
129 | "key9" : 1,
130 | "key10" : 2,
131 | "struct3" : {
132 | "k1" : 1,
133 | "k2" : 1
134 | }
135 | } ]
136 | }, {
137 | "key2" : 3,
138 | "inner2" : [ {
139 | "key9" : 3,
140 | "key10" : 1,
141 | "struct3" : {
142 | "k1" : 2,
143 | "k2" : 1
144 | }
145 | }, {
146 | "key9" : 4,
147 | "key10" : 1,
148 | "struct3" : {
149 | "k1" : 3,
150 | "k2" : 3
151 | }
152 | } ]
153 | } ],
154 | "id_str" : "2 2 1"
155 | }, {
156 | "id" : 3,
157 | "key1" : 3,
158 | "key2" : 2,
159 | "struct1" : {
160 | "key3" : 1,
161 | "key4" : 2
162 | },
163 | "struct2" : {
164 | "inner1" : {
165 | "key5" : 1,
166 | "key6" : 2,
167 | "skey1" : "3"
168 | }
169 | },
170 | "struct3" : {
171 | "inner3" : {
172 | "array3" : [ {
173 | "a1" : 5,
174 | "a2" : 3,
175 | "a3" : "4"
176 | }, {
177 | "a1" : 8,
178 | "a2" : 4,
179 | "a3" : "7"
180 | } ]
181 | }
182 | },
183 | "array1" : [ ],
184 | "array2" : [ {
185 | "key2" : 3,
186 | "inner2" : [ {
187 | "key9" : 2,
188 | "key10" : 3,
189 | "struct3" : {
190 | "k1" : 2,
191 | "k2" : 2
192 | }
193 | }, {
194 | "key9" : 2,
195 | "key10" : 1
196 | } ]
197 | }, {
198 | "key2" : 2,
199 | "inner2" : [ {
200 | "key9" : 3,
201 | "key10" : 1,
202 | "struct3" : {
203 | "k1" : 1,
204 | "k2" : 2
205 | }
206 | }, {
207 | "key9" : 2,
208 | "key10" : 1,
209 | "struct3" : {
210 | "k1" : 1,
211 | "k2" : 1
212 | }
213 | } ]
214 | } ],
215 | "id_str" : "3 3 2"
216 | }, {
217 | "id" : 4,
218 | "key1" : 2,
219 | "key2" : 3,
220 | "struct1" : {
221 | "key3" : 2,
222 | "key4" : 1
223 | },
224 | "struct2" : {
225 | "inner1" : {
226 | "key5" : 3,
227 | "key6" : 2,
228 | "skey1" : "2"
229 | }
230 | },
231 | "struct3" : {
232 | "inner3" : {
233 | "array3" : [ {
234 | "a1" : 6,
235 | "a2" : 4,
236 | "a3" : "6"
237 | }, {
238 | "a1" : 9,
239 | "a2" : 3,
240 | "a3" : "7"
241 | } ]
242 | }
243 | },
244 | "array1" : [ ],
245 | "array2" : [ {
246 | "key2" : 4,
247 | "inner2" : [ ]
248 | }, {
249 | "key2" : 1,
250 | "inner2" : [ {
251 | "key9" : 2,
252 | "key10" : 2,
253 | "struct3" : {
254 | "k1" : 1,
255 | "k2" : 1
256 | }
257 | } ]
258 | } ],
259 | "id_str" : "4 2 3"
260 | }, {
261 | "id" : 5,
262 | "key1" : 4,
263 | "key2" : 1,
264 | "struct1" : {
265 | "key3" : 3,
266 | "key4" : 3
267 | },
268 | "struct2" : {
269 | "inner1" : {
270 | "key5" : 2,
271 | "key6" : 1,
272 | "skey1" : "3"
273 | }
274 | },
275 | "struct3" : {
276 | "inner3" : {
277 | "array3" : [ {
278 | "a1" : 7,
279 | "a2" : 5,
280 | "a3" : "7"
281 | } ]
282 | }
283 | },
284 | "array1" : [ ],
285 | "array2" : [ ],
286 | "id_str" : "5 4 1"
287 | }, {
288 | "id" : 6,
289 | "key1" : 1,
290 | "key2" : 3,
291 | "struct1" : {
292 | "key3" : 1,
293 | "key4" : 2
294 | },
295 | "struct2" : {
296 | "inner1" : {
297 | "key5" : 1,
298 | "key6" : 2,
299 | "skey1" : "4"
300 | }
301 | },
302 | "struct3" : {
303 | "inner3" : {
304 | "array3" : [ {
305 | "a1" : 4,
306 | "a2" : 6,
307 | "a3" : "5"
308 | } ]
309 | }
310 | },
311 | "array1" : [ ],
312 | "array2" : [ ],
313 | "id_str" : "6 1 3"
314 | }, {
315 | "id" : 7,
316 | "key1" : 1,
317 | "key2" : 3,
318 | "struct1" : {
319 | "key3" : 1,
320 | "key4" : 2
321 | },
322 | "struct2" : {
323 | "inner1" : {
324 | "key5" : 1
325 | }
326 | },
327 | "array1" : [ ],
328 | "array2" : [ ],
329 | "id_str" : "7 1 3"
330 | }, {
331 | "id" : 8,
332 | "key1" : 1,
333 | "struct1" : {
334 | "key3" : 1
335 | }
336 | } ]
337 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested1Schema.txt:
--------------------------------------------------------------------------------
1 | root
2 | |-- id: long (nullable = true)
3 | |-- key1: long (nullable = true)
4 | |-- key2: long (nullable = true)
5 | |-- struct1: struct (nullable = true)
6 | | |-- key3: integer (nullable = true)
7 | | |-- key4: integer (nullable = true)
8 | |-- struct2: struct (nullable = true)
9 | | |-- inner1: struct (nullable = true)
10 | | | |-- key5: long (nullable = true)
11 | | | |-- key6: long (nullable = true)
12 | | | |-- skey1: string (nullable = true)
13 | |-- struct3: struct (nullable = true)
14 | | |-- inner3: struct (nullable = true)
15 | | | |-- array3: array (nullable = true)
16 | | | | |-- element: struct (containsNull = true)
17 | | | | | |-- a1: long (nullable = true)
18 | | | | | |-- a2: long (nullable = true)
19 | | | | | |-- a3: string (nullable = true)
20 | |-- array1: array (nullable = true)
21 | | |-- element: struct (containsNull = true)
22 | | | |-- key7: long (nullable = true)
23 | | | |-- key8: long (nullable = true)
24 | | | |-- skey2: string (nullable = true)
25 | |-- array2: array (nullable = true)
26 | | |-- element: struct (containsNull = true)
27 | | | |-- key2: long (nullable = true)
28 | | | |-- inner2: array (nullable = true)
29 | | | | |-- element: struct (containsNull = true)
30 | | | | | |-- key9: long (nullable = true)
31 | | | | | |-- key10: long (nullable = true)
32 | | | | | |-- struct3: struct (nullable = true)
33 | | | | | | |-- k1: integer (nullable = true)
34 | | | | | | |-- k2: integer (nullable = true)
35 | |-- id_str: string (nullable = true)
36 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested2Results.json:
--------------------------------------------------------------------------------
1 | [ {
2 | "key1" : 1,
3 | "struct2" : {
4 | "inner1" : {
5 | "key5" : 3,
6 | "key6" : 1,
7 | "skey1" : "1"
8 | },
9 | "skey2" : "1 3 1"
10 | }
11 | }, {
12 | "key1" : 2,
13 | "struct2" : {
14 | "inner1" : {
15 | "key5" : 2,
16 | "key6" : 3,
17 | "skey1" : "2"
18 | },
19 | "skey2" : "2 2 3"
20 | }
21 | }, {
22 | "key1" : 3,
23 | "struct2" : {
24 | "inner1" : {
25 | "key5" : 1,
26 | "key6" : 2,
27 | "skey1" : "3"
28 | },
29 | "skey2" : "3 1 2"
30 | }
31 | }, {
32 | "key1" : 2,
33 | "struct2" : {
34 | "inner1" : {
35 | "key5" : 3,
36 | "key6" : 2,
37 | "skey1" : "2"
38 | },
39 | "skey2" : "2 3 2"
40 | }
41 | }, {
42 | "key1" : 4,
43 | "struct2" : {
44 | "inner1" : {
45 | "key5" : 2,
46 | "key6" : 1,
47 | "skey1" : "3"
48 | },
49 | "skey2" : "4 2 1"
50 | }
51 | }, {
52 | "key1" : 1,
53 | "struct2" : {
54 | "inner1" : {
55 | "key5" : 1,
56 | "key6" : 2,
57 | "skey1" : "4"
58 | },
59 | "skey2" : "1 1 2"
60 | }
61 | }, {
62 | "key1" : 1,
63 | "struct2" : {
64 | "inner1" : {
65 | "key5" : 1
66 | }
67 | }
68 | }, {
69 | "key1" : 1,
70 | "struct2" : { }
71 | } ]
72 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested2Schema.txt:
--------------------------------------------------------------------------------
1 | root
2 | |-- key1: long (nullable = true)
3 | |-- struct2: struct (nullable = false)
4 | | |-- inner1: struct (nullable = true)
5 | | | |-- key5: long (nullable = true)
6 | | | |-- key6: long (nullable = true)
7 | | | |-- skey1: string (nullable = true)
8 | | |-- skey2: string (nullable = true)
9 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested3Results.json:
--------------------------------------------------------------------------------
1 | [ {
2 | "key1" : 1,
3 | "struct2" : {
4 | "inner1" : {
5 | "key5" : 3,
6 | "key6" : 1,
7 | "skey1" : "1",
8 | "skey2" : "1 3 1"
9 | }
10 | }
11 | }, {
12 | "key1" : 2,
13 | "struct2" : {
14 | "inner1" : {
15 | "key5" : 2,
16 | "key6" : 3,
17 | "skey1" : "2",
18 | "skey2" : "2 2 3"
19 | }
20 | }
21 | }, {
22 | "key1" : 3,
23 | "struct2" : {
24 | "inner1" : {
25 | "key5" : 1,
26 | "key6" : 2,
27 | "skey1" : "3",
28 | "skey2" : "3 1 2"
29 | }
30 | }
31 | }, {
32 | "key1" : 2,
33 | "struct2" : {
34 | "inner1" : {
35 | "key5" : 3,
36 | "key6" : 2,
37 | "skey1" : "2",
38 | "skey2" : "2 3 2"
39 | }
40 | }
41 | }, {
42 | "key1" : 4,
43 | "struct2" : {
44 | "inner1" : {
45 | "key5" : 2,
46 | "key6" : 1,
47 | "skey1" : "3",
48 | "skey2" : "4 2 1"
49 | }
50 | }
51 | }, {
52 | "key1" : 1,
53 | "struct2" : {
54 | "inner1" : {
55 | "key5" : 1,
56 | "key6" : 2,
57 | "skey1" : "4",
58 | "skey2" : "1 1 2"
59 | }
60 | }
61 | }, {
62 | "key1" : 1,
63 | "struct2" : {
64 | "inner1" : {
65 | "key5" : 1
66 | }
67 | }
68 | }, {
69 | "key1" : 1,
70 | "struct2" : {
71 | "inner1" : { }
72 | }
73 | } ]
74 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested3Schema.txt:
--------------------------------------------------------------------------------
1 | root
2 | |-- key1: long (nullable = true)
3 | |-- struct2: struct (nullable = false)
4 | | |-- inner1: struct (nullable = false)
5 | | | |-- key5: long (nullable = true)
6 | | | |-- key6: long (nullable = true)
7 | | | |-- skey1: string (nullable = true)
8 | | | |-- skey2: string (nullable = true)
9 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested4Results.json:
--------------------------------------------------------------------------------
1 | [ {
2 | "key1" : 1,
3 | "array1" : [ {
4 | "key7" : 2,
5 | "key8" : 3,
6 | "skey2" : "1",
7 | "skey3" : "1 2 3"
8 | }, {
9 | "key7" : 1,
10 | "key8" : 2,
11 | "skey2" : "2",
12 | "skey3" : "1 1 2"
13 | }, {
14 | "key7" : 3,
15 | "key8" : 3,
16 | "skey2" : "3",
17 | "skey3" : "1 3 3"
18 | } ]
19 | }, {
20 | "key1" : 2,
21 | "array1" : [ {
22 | "key7" : 4,
23 | "key8" : 2,
24 | "skey2" : "2",
25 | "skey3" : "2 4 2"
26 | }, {
27 | "key7" : 3,
28 | "key8" : 1,
29 | "skey2" : "3",
30 | "skey3" : "2 3 1"
31 | }, {
32 | "key7" : 3,
33 | "key8" : 3,
34 | "skey2" : "3",
35 | "skey3" : "2 3 3"
36 | } ]
37 | }, {
38 | "key1" : 3,
39 | "array1" : [ ]
40 | }, {
41 | "key1" : 2,
42 | "array1" : [ ]
43 | }, {
44 | "key1" : 4,
45 | "array1" : [ ]
46 | }, {
47 | "key1" : 1,
48 | "array1" : [ ]
49 | }, {
50 | "key1" : 1,
51 | "array1" : [ ]
52 | }, {
53 | "key1" : 1
54 | } ]
55 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested4Schema.txt:
--------------------------------------------------------------------------------
1 | root
2 | |-- key1: long (nullable = true)
3 | |-- array1: array (nullable = true)
4 | | |-- element: struct (containsNull = false)
5 | | | |-- key7: long (nullable = true)
6 | | | |-- key8: long (nullable = true)
7 | | | |-- skey2: string (nullable = true)
8 | | | |-- skey3: string (nullable = true)
9 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested5Results.json:
--------------------------------------------------------------------------------
1 | [ {
2 | "key1" : 1,
3 | "array2" : [ {
4 | "key2" : 1,
5 | "inner2" : [ {
6 | "key9" : 1,
7 | "key10" : 2,
8 | "struct3" : {
9 | "k1" : 1,
10 | "k2" : 2
11 | },
12 | "out" : "1 1 1 2"
13 | }, {
14 | "key9" : 1,
15 | "key10" : 2,
16 | "struct3" : {
17 | "k1" : 2,
18 | "k2" : 2
19 | },
20 | "out" : "1 1 1 2"
21 | } ]
22 | }, {
23 | "key2" : 2,
24 | "inner2" : [ {
25 | "key9" : 3,
26 | "key10" : 1,
27 | "struct3" : {
28 | "k1" : 1,
29 | "k2" : 2
30 | },
31 | "out" : "1 2 3 1"
32 | }, {
33 | "key9" : 2,
34 | "key10" : 2,
35 | "struct3" : {
36 | "k1" : 3,
37 | "k2" : 3
38 | },
39 | "out" : "1 2 2 2"
40 | } ]
41 | } ]
42 | }, {
43 | "key1" : 2,
44 | "array2" : [ {
45 | "key2" : 2,
46 | "inner2" : [ {
47 | "key9" : 1,
48 | "key10" : 2,
49 | "struct3" : {
50 | "k1" : 1,
51 | "k2" : 1
52 | },
53 | "out" : "2 2 1 2"
54 | }, {
55 | "key9" : 1,
56 | "key10" : 2,
57 | "struct3" : {
58 | "k1" : 1,
59 | "k2" : 1
60 | },
61 | "out" : "2 2 1 2"
62 | } ]
63 | }, {
64 | "key2" : 3,
65 | "inner2" : [ {
66 | "key9" : 3,
67 | "key10" : 1,
68 | "struct3" : {
69 | "k1" : 2,
70 | "k2" : 1
71 | },
72 | "out" : "2 3 3 1"
73 | }, {
74 | "key9" : 4,
75 | "key10" : 1,
76 | "struct3" : {
77 | "k1" : 3,
78 | "k2" : 3
79 | },
80 | "out" : "2 3 4 1"
81 | } ]
82 | } ]
83 | }, {
84 | "key1" : 3,
85 | "array2" : [ {
86 | "key2" : 3,
87 | "inner2" : [ {
88 | "key9" : 2,
89 | "key10" : 3,
90 | "struct3" : {
91 | "k1" : 2,
92 | "k2" : 2
93 | },
94 | "out" : "3 3 2 3"
95 | }, {
96 | "key9" : 2,
97 | "key10" : 1,
98 | "out" : "3 3 2 1"
99 | } ]
100 | }, {
101 | "key2" : 2,
102 | "inner2" : [ {
103 | "key9" : 3,
104 | "key10" : 1,
105 | "struct3" : {
106 | "k1" : 1,
107 | "k2" : 2
108 | },
109 | "out" : "3 2 3 1"
110 | }, {
111 | "key9" : 2,
112 | "key10" : 1,
113 | "struct3" : {
114 | "k1" : 1,
115 | "k2" : 1
116 | },
117 | "out" : "3 2 2 1"
118 | } ]
119 | } ]
120 | }, {
121 | "key1" : 2,
122 | "array2" : [ {
123 | "key2" : 4,
124 | "inner2" : [ ]
125 | }, {
126 | "key2" : 1,
127 | "inner2" : [ {
128 | "key9" : 2,
129 | "key10" : 2,
130 | "struct3" : {
131 | "k1" : 1,
132 | "k2" : 1
133 | },
134 | "out" : "2 1 2 2"
135 | } ]
136 | } ]
137 | }, {
138 | "key1" : 4,
139 | "array2" : [ ]
140 | }, {
141 | "key1" : 1,
142 | "array2" : [ ]
143 | }, {
144 | "key1" : 1,
145 | "array2" : [ ]
146 | }, {
147 | "key1" : 1
148 | } ]
149 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested5Schema.txt:
--------------------------------------------------------------------------------
1 | root
2 | |-- key1: long (nullable = true)
3 | |-- array2: array (nullable = true)
4 | | |-- element: struct (containsNull = false)
5 | | | |-- key2: long (nullable = true)
6 | | | |-- inner2: array (nullable = true)
7 | | | | |-- element: struct (containsNull = false)
8 | | | | | |-- key9: long (nullable = true)
9 | | | | | |-- key10: long (nullable = true)
10 | | | | | |-- struct3: struct (nullable = true)
11 | | | | | | |-- k1: integer (nullable = true)
12 | | | | | | |-- k2: integer (nullable = true)
13 | | | | | |-- out: string (nullable = true)
14 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested6Results.json:
--------------------------------------------------------------------------------
1 | [ {
2 | "array2" : [ {
3 | "key2" : 1,
4 | "inner2" : [ {
5 | "key9" : 1,
6 | "key10" : 2,
7 | "struct3" : {
8 | "k1" : 1,
9 | "k2" : 2
10 | },
11 | "out" : "2 1"
12 | }, {
13 | "key9" : 1,
14 | "key10" : 2,
15 | "struct3" : {
16 | "k1" : 2,
17 | "k2" : 2
18 | },
19 | "out" : "2 2"
20 | } ]
21 | }, {
22 | "key2" : 2,
23 | "inner2" : [ {
24 | "key9" : 3,
25 | "key10" : 1,
26 | "struct3" : {
27 | "k1" : 1,
28 | "k2" : 2
29 | },
30 | "out" : "1 1"
31 | }, {
32 | "key9" : 2,
33 | "key10" : 2,
34 | "struct3" : {
35 | "k1" : 3,
36 | "k2" : 3
37 | },
38 | "out" : "2 3"
39 | } ]
40 | } ]
41 | }, {
42 | "array2" : [ {
43 | "key2" : 2,
44 | "inner2" : [ {
45 | "key9" : 1,
46 | "key10" : 2,
47 | "struct3" : {
48 | "k1" : 1,
49 | "k2" : 1
50 | },
51 | "out" : "2 1"
52 | }, {
53 | "key9" : 1,
54 | "key10" : 2,
55 | "struct3" : {
56 | "k1" : 1,
57 | "k2" : 1
58 | },
59 | "out" : "2 1"
60 | } ]
61 | }, {
62 | "key2" : 3,
63 | "inner2" : [ {
64 | "key9" : 3,
65 | "key10" : 1,
66 | "struct3" : {
67 | "k1" : 2,
68 | "k2" : 1
69 | },
70 | "out" : "1 2"
71 | }, {
72 | "key9" : 4,
73 | "key10" : 1,
74 | "struct3" : {
75 | "k1" : 3,
76 | "k2" : 3
77 | },
78 | "out" : "1 3"
79 | } ]
80 | } ]
81 | }, {
82 | "array2" : [ {
83 | "key2" : 3,
84 | "inner2" : [ {
85 | "key9" : 2,
86 | "key10" : 3,
87 | "struct3" : {
88 | "k1" : 2,
89 | "k2" : 2
90 | },
91 | "out" : "3 2"
92 | }, {
93 | "key9" : 2,
94 | "key10" : 1
95 | } ]
96 | }, {
97 | "key2" : 2,
98 | "inner2" : [ {
99 | "key9" : 3,
100 | "key10" : 1,
101 | "struct3" : {
102 | "k1" : 1,
103 | "k2" : 2
104 | },
105 | "out" : "1 1"
106 | }, {
107 | "key9" : 2,
108 | "key10" : 1,
109 | "struct3" : {
110 | "k1" : 1,
111 | "k2" : 1
112 | },
113 | "out" : "1 1"
114 | } ]
115 | } ]
116 | }, {
117 | "array2" : [ {
118 | "key2" : 4,
119 | "inner2" : [ ]
120 | }, {
121 | "key2" : 1,
122 | "inner2" : [ {
123 | "key9" : 2,
124 | "key10" : 2,
125 | "struct3" : {
126 | "k1" : 1,
127 | "k2" : 1
128 | },
129 | "out" : "2 1"
130 | } ]
131 | } ]
132 | }, {
133 | "array2" : [ ]
134 | }, {
135 | "array2" : [ ]
136 | }, {
137 | "array2" : [ ]
138 | }, { } ]
139 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested6Schema.txt:
--------------------------------------------------------------------------------
1 | root
2 | |-- array2: array (nullable = true)
3 | | |-- element: struct (containsNull = false)
4 | | | |-- key2: long (nullable = true)
5 | | | |-- inner2: array (nullable = true)
6 | | | | |-- element: struct (containsNull = false)
7 | | | | | |-- key9: long (nullable = true)
8 | | | | | |-- key10: long (nullable = true)
9 | | | | | |-- struct3: struct (nullable = true)
10 | | | | | | |-- k1: integer (nullable = true)
11 | | | | | | |-- k2: integer (nullable = true)
12 | | | | | |-- out: string (nullable = true)
13 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested7Results.json:
--------------------------------------------------------------------------------
1 | [ {
2 | "array2" : [ {
3 | "key2" : 1,
4 | "inner2" : [ {
5 | "key9" : 1,
6 | "key10" : 2,
7 | "struct3" : {
8 | "k1" : 1,
9 | "k2" : 2,
10 | "out" : "1 2"
11 | }
12 | }, {
13 | "key9" : 1,
14 | "key10" : 2,
15 | "struct3" : {
16 | "k1" : 2,
17 | "k2" : 2,
18 | "out" : "2 2"
19 | }
20 | } ]
21 | }, {
22 | "key2" : 2,
23 | "inner2" : [ {
24 | "key9" : 3,
25 | "key10" : 1,
26 | "struct3" : {
27 | "k1" : 1,
28 | "k2" : 2,
29 | "out" : "1 1"
30 | }
31 | }, {
32 | "key9" : 2,
33 | "key10" : 2,
34 | "struct3" : {
35 | "k1" : 3,
36 | "k2" : 3,
37 | "out" : "3 2"
38 | }
39 | } ]
40 | } ]
41 | }, {
42 | "array2" : [ {
43 | "key2" : 2,
44 | "inner2" : [ {
45 | "key9" : 1,
46 | "key10" : 2,
47 | "struct3" : {
48 | "k1" : 1,
49 | "k2" : 1,
50 | "out" : "1 2"
51 | }
52 | }, {
53 | "key9" : 1,
54 | "key10" : 2,
55 | "struct3" : {
56 | "k1" : 1,
57 | "k2" : 1,
58 | "out" : "1 2"
59 | }
60 | } ]
61 | }, {
62 | "key2" : 3,
63 | "inner2" : [ {
64 | "key9" : 3,
65 | "key10" : 1,
66 | "struct3" : {
67 | "k1" : 2,
68 | "k2" : 1,
69 | "out" : "2 1"
70 | }
71 | }, {
72 | "key9" : 4,
73 | "key10" : 1,
74 | "struct3" : {
75 | "k1" : 3,
76 | "k2" : 3,
77 | "out" : "3 1"
78 | }
79 | } ]
80 | } ]
81 | }, {
82 | "array2" : [ {
83 | "key2" : 3,
84 | "inner2" : [ {
85 | "key9" : 2,
86 | "key10" : 3,
87 | "struct3" : {
88 | "k1" : 2,
89 | "k2" : 2,
90 | "out" : "2 3"
91 | }
92 | }, {
93 | "key9" : 2,
94 | "key10" : 1,
95 | "struct3" : { }
96 | } ]
97 | }, {
98 | "key2" : 2,
99 | "inner2" : [ {
100 | "key9" : 3,
101 | "key10" : 1,
102 | "struct3" : {
103 | "k1" : 1,
104 | "k2" : 2,
105 | "out" : "1 1"
106 | }
107 | }, {
108 | "key9" : 2,
109 | "key10" : 1,
110 | "struct3" : {
111 | "k1" : 1,
112 | "k2" : 1,
113 | "out" : "1 1"
114 | }
115 | } ]
116 | } ]
117 | }, {
118 | "array2" : [ {
119 | "key2" : 4,
120 | "inner2" : [ ]
121 | }, {
122 | "key2" : 1,
123 | "inner2" : [ {
124 | "key9" : 2,
125 | "key10" : 2,
126 | "struct3" : {
127 | "k1" : 1,
128 | "k2" : 1,
129 | "out" : "1 2"
130 | }
131 | } ]
132 | } ]
133 | }, {
134 | "array2" : [ ]
135 | }, {
136 | "array2" : [ ]
137 | }, {
138 | "array2" : [ ]
139 | }, { } ]
140 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested7Schema.txt:
--------------------------------------------------------------------------------
1 | root
2 | |-- array2: array (nullable = true)
3 | | |-- element: struct (containsNull = false)
4 | | | |-- key2: long (nullable = true)
5 | | | |-- inner2: array (nullable = true)
6 | | | | |-- element: struct (containsNull = false)
7 | | | | | |-- key9: long (nullable = true)
8 | | | | | |-- key10: long (nullable = true)
9 | | | | | |-- struct3: struct (nullable = false)
10 | | | | | | |-- k1: integer (nullable = true)
11 | | | | | | |-- k2: integer (nullable = true)
12 | | | | | | |-- out: string (nullable = true)
13 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested8Results.json:
--------------------------------------------------------------------------------
1 | [ {
2 | "array2" : [ {
3 | "key2" : 1,
4 | "inner2" : [ {
5 | "key9" : 1,
6 | "key10" : 2,
7 | "struct3" : {
8 | "k1" : 1,
9 | "k2" : 2
10 | },
11 | "out" : "2 1"
12 | }, {
13 | "key9" : 1,
14 | "key10" : 2,
15 | "struct3" : {
16 | "k1" : 2,
17 | "k2" : 2
18 | },
19 | "out" : "2 2"
20 | } ]
21 | }, {
22 | "key2" : 2,
23 | "inner2" : [ {
24 | "key9" : 3,
25 | "key10" : 1,
26 | "struct3" : {
27 | "k1" : 1,
28 | "k2" : 2
29 | },
30 | "out" : "1 1"
31 | }, {
32 | "key9" : 2,
33 | "key10" : 2,
34 | "struct3" : {
35 | "k1" : 3,
36 | "k2" : 3
37 | },
38 | "out" : "2 3"
39 | } ]
40 | } ],
41 | "errCol" : [ {
42 | "errType" : "confCastError",
43 | "errCode" : "E00003",
44 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
45 | "errCol" : "k1!==1",
46 | "rawValues" : [ "2" ],
47 | "mappings" : [ ]
48 | }, {
49 | "errType" : "confCastError",
50 | "errCode" : "E00003",
51 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
52 | "errCol" : "k1!==1",
53 | "rawValues" : [ "3" ],
54 | "mappings" : [ ]
55 | } ]
56 | }, {
57 | "array2" : [ {
58 | "key2" : 2,
59 | "inner2" : [ {
60 | "key9" : 1,
61 | "key10" : 2,
62 | "struct3" : {
63 | "k1" : 1,
64 | "k2" : 1
65 | },
66 | "out" : "2 1"
67 | }, {
68 | "key9" : 1,
69 | "key10" : 2,
70 | "struct3" : {
71 | "k1" : 1,
72 | "k2" : 1
73 | },
74 | "out" : "2 1"
75 | } ]
76 | }, {
77 | "key2" : 3,
78 | "inner2" : [ {
79 | "key9" : 3,
80 | "key10" : 1,
81 | "struct3" : {
82 | "k1" : 2,
83 | "k2" : 1
84 | },
85 | "out" : "1 2"
86 | }, {
87 | "key9" : 4,
88 | "key10" : 1,
89 | "struct3" : {
90 | "k1" : 3,
91 | "k2" : 3
92 | },
93 | "out" : "1 3"
94 | } ]
95 | } ],
96 | "errCol" : [ {
97 | "errType" : "confCastError",
98 | "errCode" : "E00003",
99 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
100 | "errCol" : "k1!==1",
101 | "rawValues" : [ "2" ],
102 | "mappings" : [ ]
103 | }, {
104 | "errType" : "confCastError",
105 | "errCode" : "E00003",
106 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
107 | "errCol" : "k1!==1",
108 | "rawValues" : [ "3" ],
109 | "mappings" : [ ]
110 | } ]
111 | }, {
112 | "array2" : [ {
113 | "key2" : 3,
114 | "inner2" : [ {
115 | "key9" : 2,
116 | "key10" : 3,
117 | "struct3" : {
118 | "k1" : 2,
119 | "k2" : 2
120 | },
121 | "out" : "3 2"
122 | }, {
123 | "key9" : 2,
124 | "key10" : 1
125 | } ]
126 | }, {
127 | "key2" : 2,
128 | "inner2" : [ {
129 | "key9" : 3,
130 | "key10" : 1,
131 | "struct3" : {
132 | "k1" : 1,
133 | "k2" : 2
134 | },
135 | "out" : "1 1"
136 | }, {
137 | "key9" : 2,
138 | "key10" : 1,
139 | "struct3" : {
140 | "k1" : 1,
141 | "k2" : 1
142 | },
143 | "out" : "1 1"
144 | } ]
145 | } ],
146 | "errCol" : [ {
147 | "errType" : "confCastError",
148 | "errCode" : "E00003",
149 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
150 | "errCol" : "k1!==1",
151 | "rawValues" : [ "2" ],
152 | "mappings" : [ ]
153 | } ]
154 | }, {
155 | "array2" : [ {
156 | "key2" : 4,
157 | "inner2" : [ ]
158 | }, {
159 | "key2" : 1,
160 | "inner2" : [ {
161 | "key9" : 2,
162 | "key10" : 2,
163 | "struct3" : {
164 | "k1" : 1,
165 | "k2" : 1
166 | },
167 | "out" : "2 1"
168 | } ]
169 | } ],
170 | "errCol" : [ ]
171 | }, {
172 | "array2" : [ ],
173 | "errCol" : [ ]
174 | }, {
175 | "array2" : [ ],
176 | "errCol" : [ ]
177 | }, {
178 | "array2" : [ ],
179 | "errCol" : [ ]
180 | }, {
181 | "errCol" : [ ]
182 | } ]
183 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested8Schema.txt:
--------------------------------------------------------------------------------
1 | root
2 | |-- array2: array (nullable = true)
3 | | |-- element: struct (containsNull = false)
4 | | | |-- key2: long (nullable = true)
5 | | | |-- inner2: array (nullable = true)
6 | | | | |-- element: struct (containsNull = false)
7 | | | | | |-- key9: long (nullable = true)
8 | | | | | |-- key10: long (nullable = true)
9 | | | | | |-- struct3: struct (nullable = true)
10 | | | | | | |-- k1: integer (nullable = true)
11 | | | | | | |-- k2: integer (nullable = true)
12 | | | | | |-- out: string (nullable = true)
13 | |-- errCol: array (nullable = true)
14 | | |-- element: struct (containsNull = true)
15 | | | |-- errType: string (nullable = true)
16 | | | |-- errCode: string (nullable = true)
17 | | | |-- errMsg: string (nullable = true)
18 | | | |-- errCol: string (nullable = true)
19 | | | |-- rawValues: array (nullable = true)
20 | | | | |-- element: string (containsNull = true)
21 | | | |-- mappings: array (nullable = true)
22 | | | | |-- element: struct (containsNull = true)
23 | | | | | |-- mappingTableColumn: string (nullable = true)
24 | | | | | |-- mappedDatasetColumn: string (nullable = true)
25 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested9Results.json:
--------------------------------------------------------------------------------
1 | [ {
2 | "array2" : [ {
3 | "key2" : 1,
4 | "inner2" : [ {
5 | "key9" : 1,
6 | "key10" : 2,
7 | "struct3" : {
8 | "k1" : 1,
9 | "k2" : 2
10 | },
11 | "out" : "2 1"
12 | }, {
13 | "key9" : 1,
14 | "key10" : 2,
15 | "struct3" : {
16 | "k1" : 2,
17 | "k2" : 2
18 | },
19 | "out" : "2 2"
20 | } ]
21 | }, {
22 | "key2" : 2,
23 | "inner2" : [ {
24 | "key9" : 3,
25 | "key10" : 1,
26 | "struct3" : {
27 | "k1" : 1,
28 | "k2" : 2
29 | },
30 | "out" : "1 1"
31 | }, {
32 | "key9" : 2,
33 | "key10" : 2,
34 | "struct3" : {
35 | "k1" : 3,
36 | "k2" : 3
37 | },
38 | "out" : "2 3"
39 | } ]
40 | } ],
41 | "errCol" : [ {
42 | "errType" : "Initial",
43 | "errCode" : "000",
44 | "errMsg" : "ErrMsg",
45 | "errCol" : "id",
46 | "rawValues" : [ ],
47 | "mappings" : [ ]
48 | }, {
49 | "errType" : "confCastError",
50 | "errCode" : "E00003",
51 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
52 | "errCol" : "k1!==1",
53 | "rawValues" : [ "2" ],
54 | "mappings" : [ ]
55 | }, {
56 | "errType" : "confCastError",
57 | "errCode" : "E00003",
58 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
59 | "errCol" : "k1!==1",
60 | "rawValues" : [ "3" ],
61 | "mappings" : [ ]
62 | } ]
63 | }, {
64 | "array2" : [ {
65 | "key2" : 2,
66 | "inner2" : [ {
67 | "key9" : 1,
68 | "key10" : 2,
69 | "struct3" : {
70 | "k1" : 1,
71 | "k2" : 1
72 | },
73 | "out" : "2 1"
74 | }, {
75 | "key9" : 1,
76 | "key10" : 2,
77 | "struct3" : {
78 | "k1" : 1,
79 | "k2" : 1
80 | },
81 | "out" : "2 1"
82 | } ]
83 | }, {
84 | "key2" : 3,
85 | "inner2" : [ {
86 | "key9" : 3,
87 | "key10" : 1,
88 | "struct3" : {
89 | "k1" : 2,
90 | "k2" : 1
91 | },
92 | "out" : "1 2"
93 | }, {
94 | "key9" : 4,
95 | "key10" : 1,
96 | "struct3" : {
97 | "k1" : 3,
98 | "k2" : 3
99 | },
100 | "out" : "1 3"
101 | } ]
102 | } ],
103 | "errCol" : [ {
104 | "errType" : "Initial",
105 | "errCode" : "000",
106 | "errMsg" : "ErrMsg",
107 | "errCol" : "id",
108 | "rawValues" : [ ],
109 | "mappings" : [ ]
110 | }, {
111 | "errType" : "confCastError",
112 | "errCode" : "E00003",
113 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
114 | "errCol" : "k1!==1",
115 | "rawValues" : [ "2" ],
116 | "mappings" : [ ]
117 | }, {
118 | "errType" : "confCastError",
119 | "errCode" : "E00003",
120 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
121 | "errCol" : "k1!==1",
122 | "rawValues" : [ "3" ],
123 | "mappings" : [ ]
124 | } ]
125 | }, {
126 | "array2" : [ {
127 | "key2" : 3,
128 | "inner2" : [ {
129 | "key9" : 2,
130 | "key10" : 3,
131 | "struct3" : {
132 | "k1" : 2,
133 | "k2" : 2
134 | },
135 | "out" : "3 2"
136 | }, {
137 | "key9" : 2,
138 | "key10" : 1
139 | } ]
140 | }, {
141 | "key2" : 2,
142 | "inner2" : [ {
143 | "key9" : 3,
144 | "key10" : 1,
145 | "struct3" : {
146 | "k1" : 1,
147 | "k2" : 2
148 | },
149 | "out" : "1 1"
150 | }, {
151 | "key9" : 2,
152 | "key10" : 1,
153 | "struct3" : {
154 | "k1" : 1,
155 | "k2" : 1
156 | },
157 | "out" : "1 1"
158 | } ]
159 | } ],
160 | "errCol" : [ {
161 | "errType" : "Initial",
162 | "errCode" : "000",
163 | "errMsg" : "ErrMsg",
164 | "errCol" : "id",
165 | "rawValues" : [ ],
166 | "mappings" : [ ]
167 | }, {
168 | "errType" : "confCastError",
169 | "errCode" : "E00003",
170 | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
171 | "errCol" : "k1!==1",
172 | "rawValues" : [ "2" ],
173 | "mappings" : [ ]
174 | } ]
175 | }, {
176 | "array2" : [ {
177 | "key2" : 4,
178 | "inner2" : [ ]
179 | }, {
180 | "key2" : 1,
181 | "inner2" : [ {
182 | "key9" : 2,
183 | "key10" : 2,
184 | "struct3" : {
185 | "k1" : 1,
186 | "k2" : 1
187 | },
188 | "out" : "2 1"
189 | } ]
190 | } ],
191 | "errCol" : [ {
192 | "errType" : "Initial",
193 | "errCode" : "000",
194 | "errMsg" : "ErrMsg",
195 | "errCol" : "id",
196 | "rawValues" : [ ],
197 | "mappings" : [ ]
198 | } ]
199 | }, {
200 | "array2" : [ ],
201 | "errCol" : [ {
202 | "errType" : "Initial",
203 | "errCode" : "000",
204 | "errMsg" : "ErrMsg",
205 | "errCol" : "id",
206 | "rawValues" : [ ],
207 | "mappings" : [ ]
208 | } ]
209 | }, {
210 | "array2" : [ ],
211 | "errCol" : [ {
212 | "errType" : "Initial",
213 | "errCode" : "000",
214 | "errMsg" : "ErrMsg",
215 | "errCol" : "id",
216 | "rawValues" : [ ],
217 | "mappings" : [ ]
218 | } ]
219 | }, {
220 | "array2" : [ ],
221 | "errCol" : [ {
222 | "errType" : "Initial",
223 | "errCode" : "000",
224 | "errMsg" : "ErrMsg",
225 | "errCol" : "id",
226 | "rawValues" : [ ],
227 | "mappings" : [ ]
228 | } ]
229 | }, {
230 | "errCol" : [ {
231 | "errType" : "Initial",
232 | "errCode" : "000",
233 | "errMsg" : "ErrMsg",
234 | "errCol" : "id",
235 | "rawValues" : [ ],
236 | "mappings" : [ ]
237 | } ]
238 | } ]
239 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nested9Schema.txt:
--------------------------------------------------------------------------------
1 | root
2 | |-- array2: array (nullable = true)
3 | | |-- element: struct (containsNull = false)
4 | | | |-- key2: long (nullable = true)
5 | | | |-- inner2: array (nullable = true)
6 | | | | |-- element: struct (containsNull = false)
7 | | | | | |-- key9: long (nullable = true)
8 | | | | | |-- key10: long (nullable = true)
9 | | | | | |-- struct3: struct (nullable = true)
10 | | | | | | |-- k1: integer (nullable = true)
11 | | | | | | |-- k2: integer (nullable = true)
12 | | | | | |-- out: string (nullable = true)
13 | |-- errCol: array (nullable = true)
14 | | |-- element: struct (containsNull = true)
15 | | | |-- errType: string (nullable = true)
16 | | | |-- errCode: string (nullable = true)
17 | | | |-- errMsg: string (nullable = true)
18 | | | |-- errCol: string (nullable = true)
19 | | | |-- rawValues: array (nullable = true)
20 | | | | |-- element: string (containsNull = true)
21 | | | |-- mappings: array (nullable = true)
22 | | | | |-- element: struct (containsNull = true)
23 | | | | | |-- mappingTableColumn: string (nullable = true)
24 | | | | | |-- mappedDatasetColumn: string (nullable = true)
25 |
--------------------------------------------------------------------------------
/src/test/resources/test_data/nested/nestedDf1.json:
--------------------------------------------------------------------------------
1 | {"id":1,"key1":1,"key2":2,"struct1":{"key3":3,"key4":1},"struct2":{"inner1":{"key5":3,"key6":1,"skey1":"1"}},"struct3":{"inner3":{"array3":[{"a1":3,"a2":1,"a3":"1"},{"a1":4,"a2":2,"a3":"5"}]}},"array1":[{"key7":2,"key8":3,"skey2":"1"},{"key7":1,"key8":2,"skey2":"2"},{"key7":3,"key8":3,"skey2":"3"}],"array2":[{"key2":1,"inner2":[{"key9":1,"key10":2,"struct3":{"k1":1,"k2":2}},{"key9":1,"key10":2,"struct3":{"k1":2,"k2":2}}]},{"key2":2,"inner2":[{"key9":3,"key10":1,"struct3":{"k1":1,"k2":2}},{"key9":2,"key10":2,"struct3":{"k1":3,"k2":3}}]}]}
2 | {"id":2,"key1":2,"key2":1,"struct1":{"key3":2,"key4":3},"struct2":{"inner1":{"key5":2,"key6":3,"skey1":"2"}},"struct3":{"inner3":{"array3":[{"a1":4,"a2":2,"a3":"3"},{"a1":8,"a2":2,"a3":"5"}]}},"array1":[{"key7":4,"key8":2,"skey2":"2"},{"key7":3,"key8":1,"skey2":"3"},{"key7":3,"key8":3,"skey2":"3"}],"array2":[{"key2":2,"inner2":[{"key9":1,"key10":2,"struct3":{"k1":1,"k2":1}},{"key9":1,"key10":2,"struct3":{"k1":1,"k2":1}}]},{"key2":3,"inner2":[{"key9":3,"key10":1,"struct3":{"k1":2,"k2":1}},{"key9":4,"key10":1,"struct3":{"k1":3,"k2":3}}]}]}
3 | {"id":3,"key1":3,"key2":2,"struct1":{"key3":1,"key4":2},"struct2":{"inner1":{"key5":1,"key6":2,"skey1":"3"}},"struct3":{"inner3":{"array3":[{"a1":5,"a2":3,"a3":"4"},{"a1":8,"a2":4,"a3":"7"}]}},"array1":[],"array2":[{"key2":3,"inner2":[{"key9":2,"key10":3,"struct3":{"k1":2,"k2":2}},{"key9":2,"key10":1}]},{"key2":2,"inner2":[{"key9":3,"key10":1,"struct3":{"k1":1,"k2":2}},{"key9":2,"key10":1,"struct3":{"k1":1,"k2":1}}]}]}
4 | {"id":4,"key1":2,"key2":3,"struct1":{"key3":2,"key4":1},"struct2":{"inner1":{"key5":3,"key6":2,"skey1":"2"}},"struct3":{"inner3":{"array3":[{"a1":6,"a2":4,"a3":"6"},{"a1":9,"a2":3,"a3":"7"}]}},"array1":[],"array2":[{"key2":4,"inner2":[]},{"key2":1,"inner2":[{"key9":2,"key10":2,"struct3":{"k1":1,"k2":1}}]}]}
5 | {"id":5,"key1":4,"key2":1,"struct1":{"key3":3,"key4":3},"struct2":{"inner1":{"key5":2,"key6":1,"skey1":"3"}},"struct3":{"inner3":{"array3":[{"a1":7,"a2":5,"a3":"7"}]}},"array1":[],"array2":[]}
6 | {"id":6,"key1":1,"key2":3,"struct1":{"key3":1,"key4":2},"struct2":{"inner1":{"key5":1,"key6":2,"skey1":"4"}},"struct3":{"inner3":{"array3":[{"a1":4,"a2":6,"a3":"5"}]}},"array1":[],"array2":[]}
7 | {"id":7,"key1":1,"key2":3,"struct1":{"key3":1,"key4":2},"struct2":{"inner1":{"key5":1}},"array1":[],"array2":[]}
8 | {"id":8,"key1":1,"struct1":{"key3":1}}
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/spark/hats/SparkTestBase.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats
18 |
19 | import org.apache.log4j.{Level, Logger}
20 | import org.apache.spark.sql.SparkSession
21 |
22 | trait SparkTestBase {
23 | System.setProperty("user.timezone", "UTC")
24 |
25 | // Do not display INFO entries for tests
26 | Logger.getLogger("org").setLevel(Level.WARN)
27 | Logger.getLogger("akka").setLevel(Level.WARN)
28 |
29 | implicit val spark: SparkSession = SparkSession
30 | .builder()
31 | .master("local[2]")
32 | .appName("test")
33 | .config("spark.ui.enabled", "false")
34 |     .config("spark.driver.bindAddress", "127.0.0.1")
35 | .config("spark.driver.host", "localhost")
36 | .getOrCreate()
37 | }
38 |
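39 | // A minimal sketch of a test suite mixing in this trait (hypothetical suite name):
40 | //
41 | //   import org.scalatest.funsuite.AnyFunSuite
42 | //
43 | //   class MySuite extends AnyFunSuite with SparkTestBase {
44 | //     test("shares the local Spark session") {
45 | //       assert(spark.range(3).count() == 3)
46 | //     }
47 | //   }
48 |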
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/spark/hats/transformations/DeepArrayErrorTransformationSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.transformations
18 |
19 | import org.apache.spark.sql.DataFrame
20 | import org.apache.spark.sql.functions._
21 | import org.apache.spark.sql.types.{IntegerType, StringType}
22 | import org.scalatest.funsuite.AnyFunSuite
23 | import org.slf4j.LoggerFactory
24 | import za.co.absa.spark.hats.SparkTestBase
25 | import za.co.absa.spark.hats.transformations.samples.DeepArraySamples._
26 | import za.co.absa.spark.hats.transformations.samples.SampleErrorUDFs
27 | import za.co.absa.spark.hats.utils.JsonUtils
28 |
29 | class DeepArrayErrorTransformationSuite extends AnyFunSuite with SparkTestBase {
30 | // scalastyle:off line.size.limit
31 | // scalastyle:off null
32 |
33 | import spark.implicits._
34 | import za.co.absa.spark.hats.Extensions._
35 |   implicit val sampleErrorUDFs: SampleErrorUDFs = new SampleErrorUDFs
36 |
37 | private val log = LoggerFactory.getLogger(this.getClass)
38 |
39 | test("Test casting of a plain field with error column") {
40 | val df = spark.sparkContext.parallelize(plainSampleE).toDF
41 |
42 | val expectedSchema =
43 | """root
44 | | |-- city: string (nullable = true)
45 | | |-- street: string (nullable = true)
46 | | |-- buildingNum: integer (nullable = false)
47 | | |-- zip: string (nullable = true)
48 | | |-- errors: array (nullable = true)
49 | | | |-- element: struct (containsNull = true)
50 | | | | |-- errType: string (nullable = true)
51 | | | | |-- errCode: string (nullable = true)
52 | | | | |-- errMsg: string (nullable = true)
53 | | | | |-- errCol: string (nullable = true)
54 | | | | |-- rawValues: array (nullable = true)
55 | | | | | |-- element: string (containsNull = true)
56 | | | | |-- mappings: array (nullable = true)
57 | | | | | |-- element: struct (containsNull = true)
58 | | | | | | |-- mappingTableColumn: string (nullable = true)
59 | | | | | | |-- mappedDatasetColumn: string (nullable = true)
60 | | |-- intZip: integer (nullable = true)
61 | |""".stripMargin.replace("\r\n", "\n")
62 | val expectedResults =
63 | """[ {
64 | | "city" : "Olomuc",
65 | | "street" : "Vodickova",
66 | | "buildingNum" : 12,
67 | | "zip" : "12000",
68 | | "errors" : [ ],
69 | | "intZip" : 12000
70 | |}, {
71 | | "city" : "Ostrava",
72 | | "street" : "Vlavska",
73 | | "buildingNum" : 110,
74 | | "zip" : "1455a",
75 | | "errors" : [ {
76 | | "errType" : "confCastError",
77 | | "errCode" : "E00003",
78 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
79 | | "errCol" : "intZip",
80 | | "rawValues" : [ "1455a" ],
81 | | "mappings" : [ ]
82 | | } ]
83 | |}, {
84 | | "city" : "Plzen",
85 | | "street" : "Kralova",
86 | | "buildingNum" : 71,
87 | | "zip" : "b881",
88 | | "errors" : [ {
89 | | "errType" : "myErrorType",
90 | | "errCode" : "E-1",
91 | | "errMsg" : "Testing This stuff",
92 | | "errCol" : "whatEvColumn",
93 | | "rawValues" : [ "some value" ],
94 | | "mappings" : [ ]
95 | | }, {
96 | | "errType" : "confCastError",
97 | | "errCode" : "E00003",
98 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
99 | | "errCol" : "intZip",
100 | | "rawValues" : [ "b881" ],
101 | | "mappings" : [ ]
102 | | } ]
103 | |} ]"""
104 | .stripMargin.replace("\r\n", "\n")
105 |
106 | processCastExample(df, "zip", "intZip", expectedSchema, expectedResults)
107 | }
108 |
109 | test("Test casting of a struct of struct field with error column") {
110 | val df = spark.sparkContext.parallelize(structOfStructSampleE).toDF
111 |
112 | val expectedSchema =
113 | """root
114 | | |-- id: integer (nullable = false)
115 | | |-- employee: struct (nullable = false)
116 | | | |-- name: string (nullable = true)
117 | | | |-- address: struct (nullable = false)
118 | | | | |-- city: string (nullable = true)
119 | | | | |-- street: string (nullable = true)
120 | | | | |-- buildingNum: integer (nullable = true)
121 | | | | |-- zip: string (nullable = true)
122 | | | | |-- intZip: integer (nullable = true)
123 | | |-- errors: array (nullable = true)
124 | | | |-- element: struct (containsNull = true)
125 | | | | |-- errType: string (nullable = true)
126 | | | | |-- errCode: string (nullable = true)
127 | | | | |-- errMsg: string (nullable = true)
128 | | | | |-- errCol: string (nullable = true)
129 | | | | |-- rawValues: array (nullable = true)
130 | | | | | |-- element: string (containsNull = true)
131 | | | | |-- mappings: array (nullable = true)
132 | | | | | |-- element: struct (containsNull = true)
133 | | | | | | |-- mappingTableColumn: string (nullable = true)
134 | | | | | | |-- mappedDatasetColumn: string (nullable = true)
135 | |""".stripMargin.replace("\r\n", "\n")
136 | val expectedResults =
137 | """[ {
138 | | "id" : 1,
139 | | "employee" : {
140 | | "name" : "Martin",
141 | | "address" : {
142 | | "city" : "Olomuc",
143 | | "street" : "Vodickova",
144 | | "buildingNum" : 12,
145 | | "zip" : "12000",
146 | | "intZip" : 12000
147 | | }
148 | | },
149 | | "errors" : [ ]
150 | |}, {
151 | | "id" : 1,
152 | | "employee" : {
153 | | "name" : "Petr",
154 | | "address" : {
155 | | "city" : "Ostrava",
156 | | "street" : "Vlavska",
157 | | "buildingNum" : 110,
158 | | "zip" : "1455a"
159 | | }
160 | | },
161 | | "errors" : [ {
162 | | "errType" : "myErrorType",
163 | | "errCode" : "E-1",
164 | | "errMsg" : "Testing This stuff",
165 | | "errCol" : "whatEvColumn",
166 | | "rawValues" : [ "some value" ],
167 | | "mappings" : [ ]
168 | | }, {
169 | | "errType" : "confCastError",
170 | | "errCode" : "E00003",
171 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
172 | | "errCol" : "employee.address.intZip",
173 | | "rawValues" : [ "1455a" ],
174 | | "mappings" : [ ]
175 | | } ]
176 | |}, {
177 | | "id" : 1,
178 | | "employee" : {
179 | | "name" : "Vojta",
180 | | "address" : {
181 | | "city" : "Plzen",
182 | | "street" : "Kralova",
183 | | "buildingNum" : 71,
184 | | "zip" : "b881"
185 | | }
186 | | },
187 | | "errors" : [ {
188 | | "errType" : "confCastError",
189 | | "errCode" : "E00003",
190 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
191 | | "errCol" : "employee.address.intZip",
192 | | "rawValues" : [ "b881" ],
193 | | "mappings" : [ ]
194 | | } ]
195 | |} ]"""
196 | .stripMargin.replace("\r\n", "\n")
197 |
198 | processCastExample(df, "employee.address.zip", "employee.address.intZip",
199 | expectedSchema, expectedResults)
200 | }
201 |
202 | test("Test casting of an array of struct of struct with error column") {
203 | val df = spark.sparkContext.parallelize(arrayOfStructOfStructErrSampleE).toDF
204 |
205 | val expectedSchema =
206 | """root
207 | | |-- id: integer (nullable = false)
208 | | |-- employee: array (nullable = true)
209 | | | |-- element: struct (containsNull = false)
210 | | | | |-- name: string (nullable = true)
211 | | | | |-- address: struct (nullable = false)
212 | | | | | |-- city: string (nullable = true)
213 | | | | | |-- street: string (nullable = true)
214 | | | | | |-- buildingNum: integer (nullable = true)
215 | | | | | |-- zip: string (nullable = true)
216 | | | | | |-- intZip: integer (nullable = true)
217 | | |-- errors: array (nullable = true)
218 | | | |-- element: struct (containsNull = true)
219 | | | | |-- errType: string (nullable = true)
220 | | | | |-- errCode: string (nullable = true)
221 | | | | |-- errMsg: string (nullable = true)
222 | | | | |-- errCol: string (nullable = true)
223 | | | | |-- rawValues: array (nullable = true)
224 | | | | | |-- element: string (containsNull = true)
225 | | | | |-- mappings: array (nullable = true)
226 | | | | | |-- element: struct (containsNull = true)
227 | | | | | | |-- mappingTableColumn: string (nullable = true)
228 | | | | | | |-- mappedDatasetColumn: string (nullable = true)
229 | |""".stripMargin.replace("\r\n", "\n")
230 | val expectedResults =
231 | """[ {
232 | | "id" : 1,
233 | | "employee" : [ {
234 | | "name" : "Martin",
235 | | "address" : {
236 | | "city" : "Olomuc",
237 | | "street" : "Vodickova",
238 | | "buildingNum" : 732,
239 | | "zip" : "73200",
240 | | "intZip" : 73200
241 | | }
242 | | }, {
243 | | "name" : "Stephan",
244 | | "address" : {
245 | | "city" : "Olomuc",
246 | | "street" : "Vodickova",
247 | | "buildingNum" : 77,
248 | | "zip" : "77-333"
249 | | }
250 | | } ],
251 | | "errors" : [ {
252 | | "errType" : "confCastError",
253 | | "errCode" : "E00003",
254 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
255 | | "errCol" : "employee.address.intZip",
256 | | "rawValues" : [ "77-333" ],
257 | | "mappings" : [ ]
258 | | } ]
259 | |}, {
260 | | "id" : 2,
261 | | "employee" : [ {
262 | | "name" : "Petr",
263 | | "address" : {
264 | | "city" : "Ostrava",
265 | | "street" : "Vlavska",
266 | | "buildingNum" : 25,
267 | | "zip" : "a9991"
268 | | }
269 | | }, {
270 | | "name" : "Michal",
271 | | "address" : {
272 | | "city" : "Ostrava",
273 | | "street" : "Vlavska",
274 | | "buildingNum" : 334,
275 | | "zip" : "552-aa1"
276 | | }
277 | | } ],
278 | | "errors" : [ {
279 | | "errType" : "myErrorType",
280 | | "errCode" : "E-1",
281 | | "errMsg" : "Testing This stuff",
282 | | "errCol" : "whatEvColumn",
283 | | "rawValues" : [ "some value" ],
284 | | "mappings" : [ ]
285 | | }, {
286 | | "errType" : "confCastError",
287 | | "errCode" : "E00003",
288 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
289 | | "errCol" : "employee.address.intZip",
290 | | "rawValues" : [ "a9991" ],
291 | | "mappings" : [ ]
292 | | }, {
293 | | "errType" : "confCastError",
294 | | "errCode" : "E00003",
295 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
296 | | "errCol" : "employee.address.intZip",
297 | | "rawValues" : [ "552-aa1" ],
298 | | "mappings" : [ ]
299 | | } ]
300 | |}, {
301 | | "id" : 3,
302 | | "employee" : [ {
303 | | "name" : "Vojta",
304 | | "address" : {
305 | | "city" : "Plzen",
306 | | "street" : "Kralova",
307 | | "buildingNum" : 33,
308 | | "zip" : "993",
309 | | "intZip" : 993
310 | | }
311 | | } ],
312 | | "errors" : [ ]
313 | |} ]"""
314 | .stripMargin.replace("\r\n", "\n")
315 |
316 | processCastExample(df, "employee.address.zip", "employee.address.intZip",
317 | expectedSchema, expectedResults)
318 | }
319 |
320 | test("Test casting of an array of primitives") {
321 | val df = spark.sparkContext.parallelize(arraysOfPrimitivesSampleE).toDF
322 |
323 | val expectedSchema =
324 | """root
325 | | |-- id: integer (nullable = false)
326 | | |-- nums: array (nullable = true)
327 | | | |-- element: string (containsNull = true)
328 | | |-- intNums: array (nullable = true)
329 | | | |-- element: integer (containsNull = true)
330 | | |-- errors: array (nullable = true)
331 | | | |-- element: struct (containsNull = true)
332 | | | | |-- errType: string (nullable = true)
333 | | | | |-- errCode: string (nullable = true)
334 | | | | |-- errMsg: string (nullable = true)
335 | | | | |-- errCol: string (nullable = true)
336 | | | | |-- rawValues: array (nullable = true)
337 | | | | | |-- element: string (containsNull = true)
338 | | | | |-- mappings: array (nullable = true)
339 | | | | | |-- element: struct (containsNull = true)
340 | | | | | | |-- mappingTableColumn: string (nullable = true)
341 | | | | | | |-- mappedDatasetColumn: string (nullable = true)
342 | |""".stripMargin.replace("\r\n", "\n")
343 | val expectedResults =
344 | """[ {
345 | | "id" : 1,
346 | | "nums" : [ "7755", "a212", "222-111" ],
347 | | "intNums" : [ 7755, null, null ],
348 | | "errors" : [ {
349 | | "errType" : "myErrorType",
350 | | "errCode" : "E-1",
351 | | "errMsg" : "Testing This stuff",
352 | | "errCol" : "whatEvColumn",
353 | | "rawValues" : [ "some value" ],
354 | | "mappings" : [ ]
355 | | }, {
356 | | "errType" : "confCastError",
357 | | "errCode" : "E00003",
358 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
359 | | "errCol" : "intNums",
360 | | "rawValues" : [ "a212" ],
361 | | "mappings" : [ ]
362 | | }, {
363 | | "errType" : "confCastError",
364 | | "errCode" : "E00003",
365 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
366 | | "errCol" : "intNums",
367 | | "rawValues" : [ "222-111" ],
368 | | "mappings" : [ ]
369 | | } ]
370 | |}, {
371 | | "id" : 1,
372 | | "nums" : [ "223a", "223a", "775" ],
373 | | "intNums" : [ null, null, 775 ],
374 | | "errors" : [ {
375 | | "errType" : "confCastError",
376 | | "errCode" : "E00003",
377 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
378 | | "errCol" : "intNums",
379 | | "rawValues" : [ "223a" ],
380 | | "mappings" : [ ]
381 | | } ]
382 | |}, {
383 | | "id" : 1,
384 | | "nums" : [ "5", "-100", "9999999" ],
385 | | "intNums" : [ 5, -100, 9999999 ],
386 | | "errors" : [ ]
387 | |} ]"""
388 | .stripMargin.replace("\r\n", "\n")
389 |
390 | processCastExample(df, "nums", "intNums", expectedSchema, expectedResults)
391 | }
392 |
393 | test("Test casting of an array of array of primitives") {
394 | val df = spark.sparkContext.parallelize(arraysOfArraysOfPrimitivesSampleE).toDF
395 |
396 | val expectedSchema =
397 | """root
398 | | |-- id: integer (nullable = false)
399 | | |-- matrix: array (nullable = true)
400 | | | |-- element: array (containsNull = true)
401 | | | | |-- element: string (containsNull = true)
402 | | |-- intMatrix: array (nullable = true)
403 | | | |-- element: array (containsNull = true)
404 | | | | |-- element: integer (containsNull = true)
405 | | |-- errors: array (nullable = true)
406 | | | |-- element: struct (containsNull = true)
407 | | | | |-- errType: string (nullable = true)
408 | | | | |-- errCode: string (nullable = true)
409 | | | | |-- errMsg: string (nullable = true)
410 | | | | |-- errCol: string (nullable = true)
411 | | | | |-- rawValues: array (nullable = true)
412 | | | | | |-- element: string (containsNull = true)
413 | | | | |-- mappings: array (nullable = true)
414 | | | | | |-- element: struct (containsNull = true)
415 | | | | | | |-- mappingTableColumn: string (nullable = true)
416 | | | | | | |-- mappedDatasetColumn: string (nullable = true)
417 | |""".stripMargin.replace("\r\n", "\n")
418 | val expectedResults =
419 | """[ {
420 | | "id" : 1,
421 | | "matrix" : [ [ "10", "11b" ], [ "11b", "12" ] ],
422 | | "intMatrix" : [ [ 10, null ], [ null, 12 ] ],
423 | | "errors" : [ {
424 | | "errType" : "myErrorType",
425 | | "errCode" : "E-1",
426 | | "errMsg" : "Testing This stuff",
427 | | "errCol" : "whatEvColumn",
428 | | "rawValues" : [ "some value" ],
429 | | "mappings" : [ ]
430 | | }, {
431 | | "errType" : "confCastError",
432 | | "errCode" : "E00003",
433 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
434 | | "errCol" : "intMatrix",
435 | | "rawValues" : [ "11b" ],
436 | | "mappings" : [ ]
437 | | } ]
438 | |}, {
439 | | "id" : 2,
440 | | "matrix" : [ [ "20f", "300" ], [ "1000", "10-10" ] ],
441 | | "intMatrix" : [ [ null, 300 ], [ 1000, null ] ],
442 | | "errors" : [ {
443 | | "errType" : "confCastError",
444 | | "errCode" : "E00003",
445 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
446 | | "errCol" : "intMatrix",
447 | | "rawValues" : [ "20f" ],
448 | | "mappings" : [ ]
449 | | }, {
450 | | "errType" : "confCastError",
451 | | "errCode" : "E00003",
452 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
453 | | "errCol" : "intMatrix",
454 | | "rawValues" : [ "10-10" ],
455 | | "mappings" : [ ]
456 | | } ]
457 | |}, {
458 | | "id" : 3,
459 | | "matrix" : [ [ "775", "223" ], [ "100", "0" ] ],
460 | | "intMatrix" : [ [ 775, 223 ], [ 100, 0 ] ],
461 | | "errors" : [ ]
462 | |} ]"""
463 | .stripMargin.replace("\r\n", "\n")
464 |
465 | processCastExample(df, "matrix", "intMatrix", expectedSchema, expectedResults)
466 | }
467 |
468 | test("Test casting of an array of struct of array of struct with error column") {
469 | val df = spark.sparkContext.parallelize(arraysOfStructsDeepSampleE).toDF
470 |
471 | val expectedSchema =
472 | """root
473 | | |-- id: integer (nullable = false)
474 | | |-- legs: array (nullable = true)
475 | | | |-- element: struct (containsNull = false)
476 | | | | |-- legid: integer (nullable = true)
477 | | | | |-- conditions: array (nullable = true)
478 | | | | | |-- element: struct (containsNull = false)
479 | | | | | | |-- conif: string (nullable = true)
480 | | | | | | |-- conthen: string (nullable = true)
481 | | | | | | |-- amount: double (nullable = true)
482 | | | | | | |-- intConditionVal: integer (nullable = true)
483 | | |-- errors: array (nullable = true)
484 | | | |-- element: struct (containsNull = true)
485 | | | | |-- errType: string (nullable = true)
486 | | | | |-- errCode: string (nullable = true)
487 | | | | |-- errMsg: string (nullable = true)
488 | | | | |-- errCol: string (nullable = true)
489 | | | | |-- rawValues: array (nullable = true)
490 | | | | | |-- element: string (containsNull = true)
491 | | | | |-- mappings: array (nullable = true)
492 | | | | | |-- element: struct (containsNull = true)
493 | | | | | | |-- mappingTableColumn: string (nullable = true)
494 | | | | | | |-- mappedDatasetColumn: string (nullable = true)
495 | |""".stripMargin.replace("\r\n", "\n")
496 | val expectedResults =
497 | """[ {
498 | | "id" : 1,
499 | | "legs" : [ {
500 | | "legid" : 100,
501 | | "conditions" : [ {
502 | | "conif" : "if bid>10",
503 | | "conthen" : "100",
504 | | "amount" : 100.0,
505 | | "intConditionVal" : 100
506 | | }, {
507 | | "conif" : "if sell<5",
508 | | "conthen" : "300a",
509 | | "amount" : 150.0
510 | | }, {
511 | | "conif" : "if sell<1",
512 | | "conthen" : "1000",
513 | | "amount" : 1000.0,
514 | | "intConditionVal" : 1000
515 | | } ]
516 | | }, {
517 | | "legid" : 101,
518 | | "conditions" : [ {
519 | | "conif" : "if bid<50",
520 | | "conthen" : "200",
521 | | "amount" : 200.0,
522 | | "intConditionVal" : 200
523 | | }, {
524 | | "conif" : "if sell>30",
525 | | "conthen" : "175b",
526 | | "amount" : 175.0
527 | | }, {
528 | | "conif" : "if sell>25",
529 | | "conthen" : "225-225",
530 | | "amount" : 225.0
531 | | } ]
532 | | } ],
533 | | "errors" : [ {
534 | | "errType" : "confCastError",
535 | | "errCode" : "E00003",
536 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
537 | | "errCol" : "legs.conditions.intConditionVal",
538 | | "rawValues" : [ "300a" ],
539 | | "mappings" : [ ]
540 | | }, {
541 | | "errType" : "confCastError",
542 | | "errCode" : "E00003",
543 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
544 | | "errCol" : "legs.conditions.intConditionVal",
545 | | "rawValues" : [ "175b" ],
546 | | "mappings" : [ ]
547 | | }, {
548 | | "errType" : "confCastError",
549 | | "errCode" : "E00003",
550 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
551 | | "errCol" : "legs.conditions.intConditionVal",
552 | | "rawValues" : [ "225-225" ],
553 | | "mappings" : [ ]
554 | | } ]
555 | |}, {
556 | | "id" : 2,
557 | | "legs" : [ {
558 | | "legid" : 102,
559 | | "conditions" : [ {
560 | | "conif" : "if bid>11",
561 | | "conthen" : "100",
562 | | "amount" : 100.0,
563 | | "intConditionVal" : 100
564 | | }, {
565 | | "conif" : "if sell<6",
566 | | "conthen" : "150",
567 | | "amount" : 150.0,
568 | | "intConditionVal" : 150
569 | | }, {
570 | | "conif" : "if sell<2",
571 | | "conthen" : "1000",
572 | | "amount" : 1000.0,
573 | | "intConditionVal" : 1000
574 | | } ]
575 | | }, {
576 | | "legid" : 103,
577 | | "conditions" : [ {
578 | | "conif" : "if bid<51",
579 | | "conthen" : "200",
580 | | "amount" : 200.0,
581 | | "intConditionVal" : 200
582 | | }, {
583 | | "conif" : "if sell>31",
584 | | "conthen" : "175",
585 | | "amount" : 175.0,
586 | | "intConditionVal" : 175
587 | | }, {
588 | | "conif" : "if sell>26",
589 | | "conthen" : "225",
590 | | "amount" : 225.0,
591 | | "intConditionVal" : 225
592 | | } ]
593 | | } ],
594 | | "errors" : [ ]
595 | |}, {
596 | | "id" : 3,
597 | | "legs" : [ {
598 | | "legid" : 104,
599 | | "conditions" : [ {
600 | | "conif" : "if bid>12",
601 | | "conthen" : "1OO",
602 | | "amount" : 100.0
603 | | }, {
604 | | "conif" : "if sell<7",
605 | | "conthen" : "150x",
606 | | "amount" : 150.0
607 | | }, {
608 | | "conif" : "if sell<3",
609 | | "conthen" : "-1000-",
610 | | "amount" : 1000.0
611 | | } ]
612 | | }, {
613 | | "legid" : 105,
614 | | "conditions" : [ {
615 | | "conif" : "if bid<52",
616 | | "conthen" : "2OO",
617 | | "amount" : 200.0
618 | | }, {
619 | | "conif" : "if sell>32",
620 | | "conthen" : "f175",
621 | | "amount" : 175.0
622 | | }, {
623 | | "conif" : "if sell>27",
624 | | "conthen" : "225_",
625 | | "amount" : 225.0
626 | | } ]
627 | | } ],
628 | | "errors" : [ {
629 | | "errType" : "myErrorType",
630 | | "errCode" : "E-1",
631 | | "errMsg" : "Testing This stuff",
632 | | "errCol" : "whatEvColumn",
633 | | "rawValues" : [ "some value" ],
634 | | "mappings" : [ ]
635 | | }, {
636 | | "errType" : "confCastError",
637 | | "errCode" : "E00003",
638 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
639 | | "errCol" : "legs.conditions.intConditionVal",
640 | | "rawValues" : [ "1OO" ],
641 | | "mappings" : [ ]
642 | | }, {
643 | | "errType" : "confCastError",
644 | | "errCode" : "E00003",
645 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
646 | | "errCol" : "legs.conditions.intConditionVal",
647 | | "rawValues" : [ "150x" ],
648 | | "mappings" : [ ]
649 | | }, {
650 | | "errType" : "confCastError",
651 | | "errCode" : "E00003",
652 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
653 | | "errCol" : "legs.conditions.intConditionVal",
654 | | "rawValues" : [ "-1000-" ],
655 | | "mappings" : [ ]
656 | | }, {
657 | | "errType" : "confCastError",
658 | | "errCode" : "E00003",
659 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
660 | | "errCol" : "legs.conditions.intConditionVal",
661 | | "rawValues" : [ "2OO" ],
662 | | "mappings" : [ ]
663 | | }, {
664 | | "errType" : "confCastError",
665 | | "errCode" : "E00003",
666 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
667 | | "errCol" : "legs.conditions.intConditionVal",
668 | | "rawValues" : [ "f175" ],
669 | | "mappings" : [ ]
670 | | }, {
671 | | "errType" : "confCastError",
672 | | "errCode" : "E00003",
673 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
674 | | "errCol" : "legs.conditions.intConditionVal",
675 | | "rawValues" : [ "225_" ],
676 | | "mappings" : [ ]
677 | | } ]
678 | |} ]"""
679 | .stripMargin.replace("\r\n", "\n")
680 |
681 | processCastExample(df, "legs.conditions.conthen", "legs.conditions.intConditionVal",
682 | expectedSchema, expectedResults)
683 | }
684 |
685 | test("Test casting of an array of struct of struct WITHOUT error column") {
686 | val df = spark.sparkContext.parallelize(arrayOfStructOfStructNoErrSampleE).toDF
687 |
688 | val expectedSchema =
689 | """root
690 | | |-- id: integer (nullable = false)
691 | | |-- employee: array (nullable = true)
692 | | | |-- element: struct (containsNull = false)
693 | | | | |-- name: string (nullable = true)
694 | | | | |-- address: struct (nullable = false)
695 | | | | | |-- city: string (nullable = true)
696 | | | | | |-- street: string (nullable = true)
697 | | | | | |-- buildingNum: integer (nullable = true)
698 | | | | | |-- zip: string (nullable = true)
699 | | | | | |-- intZip: integer (nullable = true)
700 | | |-- errors: array (nullable = true)
701 | | | |-- element: struct (containsNull = true)
702 | | | | |-- errType: string (nullable = true)
703 | | | | |-- errCode: string (nullable = true)
704 | | | | |-- errMsg: string (nullable = true)
705 | | | | |-- errCol: string (nullable = true)
706 | | | | |-- rawValues: array (nullable = true)
707 | | | | | |-- element: string (containsNull = true)
708 | | | | |-- mappings: array (nullable = true)
709 | | | | | |-- element: struct (containsNull = true)
710 | | | | | | |-- mappingTableColumn: string (nullable = true)
711 | | | | | | |-- mappedDatasetColumn: string (nullable = true)
712 | |""".stripMargin.replace("\r\n", "\n")
713 | val expectedResults =
714 | """[ {
715 | | "id" : 1,
716 | | "employee" : [ {
717 | | "name" : "Martin",
718 | | "address" : {
719 | | "city" : "Olomuc",
720 | | "street" : "Vodickova",
721 | | "buildingNum" : 732,
722 | | "zip" : "73200",
723 | | "intZip" : 73200
724 | | }
725 | | }, {
726 | | "name" : "Stephan",
727 | | "address" : {
728 | | "city" : "Olomuc",
729 | | "street" : "Vodickova",
730 | | "buildingNum" : 77,
731 | | "zip" : "77-333"
732 | | }
733 | | } ],
734 | | "errors" : [ {
735 | | "errType" : "confCastError",
736 | | "errCode" : "E00003",
737 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
738 | | "errCol" : "employee.address.intZip",
739 | | "rawValues" : [ "77-333" ],
740 | | "mappings" : [ ]
741 | | } ]
742 | |}, {
743 | | "id" : 2,
744 | | "employee" : [ {
745 | | "name" : "Petr",
746 | | "address" : {
747 | | "city" : "Ostrava",
748 | | "street" : "Vlavska",
749 | | "buildingNum" : 25,
750 | | "zip" : "a9991"
751 | | }
752 | | }, {
753 | | "name" : "Michal",
754 | | "address" : {
755 | | "city" : "Ostrava",
756 | | "street" : "Vlavska",
757 | | "buildingNum" : 334,
758 | | "zip" : "552-aa1"
759 | | }
760 | | } ],
761 | | "errors" : [ {
762 | | "errType" : "confCastError",
763 | | "errCode" : "E00003",
764 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
765 | | "errCol" : "employee.address.intZip",
766 | | "rawValues" : [ "a9991" ],
767 | | "mappings" : [ ]
768 | | }, {
769 | | "errType" : "confCastError",
770 | | "errCode" : "E00003",
771 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
772 | | "errCol" : "employee.address.intZip",
773 | | "rawValues" : [ "552-aa1" ],
774 | | "mappings" : [ ]
775 | | } ]
776 | |}, {
777 | | "id" : 3,
778 | | "employee" : [ {
779 | | "name" : "Vojta",
780 | | "address" : {
781 | | "city" : "Plzen",
782 | | "street" : "Kralova",
783 | | "buildingNum" : 33,
784 | | "zip" : "993",
785 | | "intZip" : 993
786 | | }
787 | | } ],
788 | | "errors" : [ ]
789 | |} ]"""
790 | .stripMargin.replace("\r\n", "\n")
791 |
792 | processCastExample(df, "employee.address.zip", "employee.address.intZip",
793 | expectedSchema, expectedResults)
794 | }
795 |
796 | test("Test multiple levels of nesting") {
797 |
798 | val sample = """[{"id":1,"legs":[{"legid":100,"conditions":[{"checks":[{"checkNums":["1","2","3b","4","5c","6"]}],"amount":100}]}]}]"""
799 |
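    | // The sample JSON nests arrays four levels deep: legs -> conditions -> checks -> checkNums.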
800 | val df = JsonUtils.getDataFrameFromJson(spark, Seq(sample))
801 |
802 | val expectedSchema =
803 | """root
804 | | |-- id: long (nullable = true)
805 | | |-- legs: array (nullable = true)
806 | | | |-- element: struct (containsNull = false)
807 | | | | |-- conditions: array (nullable = true)
808 | | | | | |-- element: struct (containsNull = false)
809 | | | | | | |-- amount: long (nullable = true)
810 | | | | | | |-- checks: array (nullable = true)
811 | | | | | | | |-- element: struct (containsNull = false)
812 | | | | | | | | |-- checkNums: array (nullable = true)
813 | | | | | | | | | |-- element: string (containsNull = true)
814 | | | | | | | | |-- optimizedNums: array (nullable = true)
815 | | | | | | | | | |-- element: integer (containsNull = true)
816 | | | | |-- legid: long (nullable = true)
817 | | |-- errors: array (nullable = true)
818 | | | |-- element: struct (containsNull = true)
819 | | | | |-- errType: string (nullable = true)
820 | | | | |-- errCode: string (nullable = true)
821 | | | | |-- errMsg: string (nullable = true)
822 | | | | |-- errCol: string (nullable = true)
823 | | | | |-- rawValues: array (nullable = true)
824 | | | | | |-- element: string (containsNull = true)
825 | | | | |-- mappings: array (nullable = true)
826 | | | | | |-- element: struct (containsNull = true)
827 | | | | | | |-- mappingTableColumn: string (nullable = true)
828 | | | | | | |-- mappedDatasetColumn: string (nullable = true)
829 | |""".stripMargin.replace("\r\n", "\n")
830 | val expectedResults =
831 | """[ {
832 | | "id" : 1,
833 | | "legs" : [ {
834 | | "conditions" : [ {
835 | | "amount" : 100,
836 | | "checks" : [ {
837 | | "checkNums" : [ "1", "2", "3b", "4", "5c", "6" ],
838 | | "optimizedNums" : [ 1, 2, null, 4, null, 6 ]
839 | | } ]
840 | | } ],
841 | | "legid" : 100
842 | | } ],
843 | | "errors" : [ {
844 | | "errType" : "confCastError",
845 | | "errCode" : "E00003",
846 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
847 | | "errCol" : "legs.conditions.checks.optimizedNums",
848 | | "rawValues" : [ "3b" ],
849 | | "mappings" : [ ]
850 | | }, {
851 | | "errType" : "confCastError",
852 | | "errCode" : "E00003",
853 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
854 | | "errCol" : "legs.conditions.checks.optimizedNums",
855 | | "rawValues" : [ "5c" ],
856 | | "mappings" : [ ]
857 | | } ]
858 | |} ]"""
859 | .stripMargin.replace("\r\n", "\n")
860 |
861 | processCastExample(df, "legs.conditions.checks.checkNums", "legs.conditions.checks.optimizedNums",
862 | expectedSchema, expectedResults)
863 | }
864 |
865 | test("Test combining fields on multiple levels of nesting") {
866 |
867 | val sample = """[{"id":1,"legs":[{"legid":100,"conditions":[{"checks":[{"checkNums":["1","2","3b","4","5c","6"]}],"amount":100}]}]}]"""
868 |
869 | val df = JsonUtils.getDataFrameFromJson(spark, Seq(sample))
870 |
871 | val expectedSchema =
872 | """root
873 | | |-- id: long (nullable = true)
874 | | |-- legs: array (nullable = true)
875 | | | |-- element: struct (containsNull = false)
876 | | | | |-- conditions: array (nullable = true)
877 | | | | | |-- element: struct (containsNull = false)
878 | | | | | | |-- amount: long (nullable = true)
879 | | | | | | |-- checks: array (nullable = true)
880 | | | | | | | |-- element: struct (containsNull = false)
881 | | | | | | | | |-- checkNums: array (nullable = true)
882 | | | | | | | | | |-- element: string (containsNull = true)
883 | | | | | | | | |-- optimizedNums: array (nullable = true)
884 | | | | | | | | | |-- element: string (containsNull = true)
885 | | | | |-- legid: long (nullable = true)
886 | | |-- errors: array (nullable = true)
887 | | | |-- element: struct (containsNull = true)
888 | | | | |-- errType: string (nullable = true)
889 | | | | |-- errCode: string (nullable = true)
890 | | | | |-- errMsg: string (nullable = true)
891 | | | | |-- errCol: string (nullable = true)
892 | | | | |-- rawValues: array (nullable = true)
893 | | | | | |-- element: string (containsNull = true)
894 | | | | |-- mappings: array (nullable = true)
895 | | | | | |-- element: struct (containsNull = true)
896 | | | | | | |-- mappingTableColumn: string (nullable = true)
897 | | | | | | |-- mappedDatasetColumn: string (nullable = true)
898 | |""".stripMargin.replace("\r\n", "\n")
899 |
900 | val expectedResults =
901 | """[ {
902 | | "id" : 1,
903 | | "legs" : [ {
904 | | "conditions" : [ {
905 | | "amount" : 100,
906 | | "checks" : [ {
907 | | "checkNums" : [ "1", "2", "3b", "4", "5c", "6" ],
908 | | "optimizedNums" : [ "1_100_1", "2_100_1", "3b_100_1", "4_100_1", "5c_100_1", "6_100_1" ]
909 | | } ]
910 | | } ],
911 | | "legid" : 100
912 | | } ],
913 | | "errors" : [ {
914 | | "errType" : "confCastError",
915 | | "errCode" : "E00003",
916 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
917 | | "errCol" : "legs.conditions.checks.optimizedNums",
918 | | "rawValues" : [ "3b" ],
919 | | "mappings" : [ ]
920 | | }, {
921 | | "errType" : "confCastError",
922 | | "errCode" : "E00003",
923 | | "errMsg" : "Conformance Error - Null returned by casting conformance rule",
924 | | "errCol" : "legs.conditions.checks.optimizedNums",
925 | | "rawValues" : [ "5c" ],
926 | | "mappings" : [ ]
927 | | } ]
928 | |} ]"""
929 | .stripMargin.replace("\r\n", "\n")
930 |
931 | val inputColumn = "legs.conditions.checks.checkNums"
932 | val outputColumn = "legs.conditions.checks.optimizedNums"
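    | // 'gf' (presumably "get field") resolves fully qualified field names relative to the
    | // current array element, so values from other nesting levels ('legs.conditions.amount'
    | // and the root-level 'id') can be combined with the input column in one expression.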
933 | val dfOut = NestedArrayTransformations.nestedExtendedWithColumnAndErrorMap(df, inputColumn, outputColumn, "errors",
934 | (_, gf) => {
935 | concat(gf(inputColumn),
936 | lit("_"),
937 | gf("legs.conditions.amount").cast(StringType),
938 | lit("_"),
939 | gf("id"))
940 | }, (c, gf) => {
941 | when(c.isNotNull.and(c.cast(IntegerType).isNull)
942 | .and(gf("legs.conditions.amount") === 100)
943 | .and(gf("legs.legid") === 100)
944 | .and(gf("id") === 1),
945 | callUDF("confCastErr", lit(outputColumn), gf(inputColumn).cast(StringType)))
946 | .otherwise(null)
947 | })
948 |
949 | val actualSchema = dfOut.schema.treeString
950 | val actualResults = JsonUtils.prettySparkJSON(dfOut.toJSON.collect)
951 |
952 | assertSchema(actualSchema, expectedSchema)
953 | assertResults(actualResults, expectedResults)
954 | }
955 |
956 | test("Test deep array transformations unhappy paths") {
957 | val df = spark.sparkContext.parallelize(Seq(1,2,3,4,5)).toDF()
958 |
959 | assert(intercept[IllegalArgumentException] {
960 | NestedArrayTransformations.nestedWithColumnAndErrorMap(df, "value", "value2", "err.errors", c => c, e => e)
961 | }.getMessage contains "Error columns should be at the root schema level")
962 |
963 | assert(intercept[IllegalArgumentException] {
964 | NestedArrayTransformations.nestedWithColumnMap(df, "value.foo", "value.foo2", c => c)
965 | }.getMessage contains "Field 'value' is not a struct type or an array")
966 |
967 | assert(intercept[IllegalArgumentException] {
968 | NestedArrayTransformations.nestedWithColumnMap(df, "value", "", _ => lit("foo")).printSchema()
969 | }.getMessage contains "Output field cannot be empty")
970 |
971 | assert(intercept[IllegalArgumentException] {
972 | df.nestedWithColumn("value", lit("foo")).printSchema()
973 | }.getMessage contains "The column 'value' already exists")
974 |
975 | }
976 |
977 | test("Test array_distinct() from Spark API (didn't work in 2.4.0, fixed in 2.4.1)"){
978 | val sourceData =
979 | """{
980 | | "id": 3,
981 | | "MyLiteral": "abcdef",
982 | | "MyUpperLiteral": "ABCDEF",
983 | | "errCol": [
984 | | {
985 | | "errType": "confMapError",
986 | | "errCode": "E00001",
987 | | "errMsg": "Conformance Error - Null produced by mapping conformance rule",
988 | | "errCol": "legs.conditions.conformed_country",
989 | | "rawValues": [
990 | | "SWE"
991 | | ],
992 | | "mappings": [
993 | | {
994 | | "mappingTableColumn": "country_code",
995 | | "mappedDatasetColumn": "legs.conditions.country"
996 | | }
997 | | ]
998 | | },
999 | | {
1000 | | "errType": "confMapError",
1001 | | "errCode": "E00001",
1002 | | "errMsg": "Conformance Error - Null produced by mapping conformance rule",
1003 | | "errCol": "legs.conditions.conformed_country",
1004 | | "rawValues": [
1005 | | "SWE"
1006 | | ],
1007 | | "mappings": [
1008 | | {
1009 | | "mappingTableColumn": "country_code",
1010 | | "mappedDatasetColumn": "legs.conditions.country"
1011 | | }
1012 | | ]
1013 | | },
1014 | | {
1015 | | "errType": "confMapError",
1016 | | "errCode": "E00001",
1017 | | "errMsg": "Conformance Error - Null produced by mapping conformance rule",
1018 | | "errCol": "legs.conditions.conformed_currency",
1019 | | "rawValues": [
1020 | | "Dummy"
1021 | | ],
1022 | | "mappings": [
1023 | | {
1024 | | "mappingTableColumn": "currency_code",
1025 | | "mappedDatasetColumn": "legs.conditions.currency"
1026 | | }
1027 | | ]
1028 | | }
1029 | | ],
1030 | | "legs": [
1031 | | {
1032 | | "conditions": [
1033 | | {
1034 | | "checks": [],
1035 | | "country": "SWE",
1036 | | "currency": "SWK",
1037 | | "product": "Stock",
1038 | | "conformed_currency": "SEK",
1039 | | "conformed_product": "STK"
1040 | | }
1041 | | ],
1042 | | "legid": 300
1043 | | },
1044 | | {
1045 | | "conditions": [
1046 | | {
1047 | | "checks": [],
1048 | | "country": "SA",
1049 | | "currency": "Dummy",
1050 | | "product": "Bond",
1051 | | "conformed_country": "South Africa",
1052 | | "conformed_currency": "Unknown",
1053 | | "conformed_product": "BND"
1054 | | }
1055 | | ],
1056 | | "legid": 301
1057 | | }
1058 | | ]
1059 | |}""".stripMargin
1060 |
1061 | val expectedDistinct =
1062 | """{
1063 | | "MyLiteral" : "abcdef",
1064 | | "errCol" : [ {
1065 | | "errCode" : "E00001",
1066 | | "errCol" : "legs.conditions.conformed_country",
1067 | | "errMsg" : "Conformance Error - Null produced by mapping conformance rule",
1068 | | "errType" : "confMapError",
1069 | | "mappings" : [ {
1070 | | "mappedDatasetColumn" : "legs.conditions.country",
1071 | | "mappingTableColumn" : "country_code"
1072 | | } ],
1073 | | "rawValues" : [ "SWE" ]
1074 | | }, {
1075 | | "errCode" : "E00001",
1076 | | "errCol" : "legs.conditions.conformed_currency",
1077 | | "errMsg" : "Conformance Error - Null produced by mapping conformance rule",
1078 | | "errType" : "confMapError",
1079 | | "mappings" : [ {
1080 | | "mappedDatasetColumn" : "legs.conditions.currency",
1081 | | "mappingTableColumn" : "currency_code"
1082 | | } ],
1083 | | "rawValues" : [ "Dummy" ]
1084 | | } ]
1085 | |}""".stripMargin.replace("\r\n", "\n")
1086 |
1087 | val df = JsonUtils.getDataFrameFromJson(spark, Seq(sourceData))
1088 |
1089 | val dfDistinct = df.select(col("MyLiteral"), array_distinct(col("errCol")).as("errCol"))
1090 |
1091 | val actualDistinct = JsonUtils.prettyJSON(dfDistinct.toJSON.take(1)(0))
1092 |
1093 | assert(actualDistinct == expectedDistinct)
1094 | }
1095 |
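    | // Shared test harness: casts 'inputColumn' to IntegerType into 'outputColumn' and,
    | // for every non-null value that fails the cast, appends a 'confCastErr' error to the
    | // root-level 'errors' column before comparing the schema and JSON against expectations.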
1096 | private def processCastExample(df: DataFrame, inputColumn: String, outputColumn: String, expectedSchema: String,
1097 | expectedResults: String): Unit = {
1098 | val dfOut = NestedArrayTransformations.nestedWithColumnAndErrorMap(df, inputColumn, outputColumn, "errors",
1099 | c => {
1100 | c.cast(IntegerType)
1101 | }, c => {
1102 | when(c.isNotNull.and(c.cast(IntegerType).isNull),
1103 | callUDF("confCastErr", lit(outputColumn), c.cast(StringType)))
1104 | .otherwise(null)
1105 | })
1106 |
1107 | val actualSchema = dfOut.schema.treeString
1108 | val actualResults = JsonUtils.prettySparkJSON(dfOut.toJSON.collect)
1109 |
1110 | assertSchema(actualSchema, expectedSchema)
1111 | assertResults(actualResults, expectedResults)
1112 | }
1113 |
1114 | private def assertSchema(actualSchema: String, expectedSchema: String): Unit = {
1115 | if (actualSchema != expectedSchema) {
1116 | log.error("EXPECTED:")
1117 | log.error(expectedSchema)
1118 | log.error("ACTUAL:")
1119 | log.error(actualSchema)
1120 | fail("Actual conformed schema does not match the expected schema (see above).")
1121 | }
1122 | }
1123 |
1124 | private def assertResults(actualResults: String, expectedResults: String): Unit = {
1125 | if (actualResults != expectedResults) {
1126 | log.error("EXPECTED:")
1127 | log.error(expectedResults)
1128 | log.error("ACTUAL:")
1129 | log.error(actualResults)
1130 | fail("Actual conformed dataset JSON does not match the expected JSON (see above).")
1131 | }
1132 | }
1133 | }
1134 |
1135 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/spark/hats/transformations/ExtendedTransformationsSuite.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.transformations
18 |
19 | import org.apache.commons.io.IOUtils
20 | import org.apache.spark.sql.functions._
21 | import org.apache.spark.sql.types.StringType
22 | import org.scalatest.funsuite.AnyFunSuite
23 | import org.slf4j.LoggerFactory
24 | import za.co.absa.spark.hats.SparkTestBase
25 | import za.co.absa.spark.hats.transformations.samples.{ErrorMessage, NestedTestCaseFactory, SampleErrorUDFs}
26 | import za.co.absa.spark.hats.utils.JsonUtils
27 |
28 | class ExtendedTransformationsSuite extends AnyFunSuite with SparkTestBase {
29 | implicit val _: SampleErrorUDFs = new SampleErrorUDFs
30 |
31 | private val log = LoggerFactory.getLogger(this.getClass)
32 | private val nestedTestCaseFactory = new NestedTestCaseFactory()
33 |
34 | test("Test extended array transformations work on root level fields") {
35 | val expectedSchema = getResourceString("/test_data/nested/nested1Schema.txt")
36 | val expectedResults = getResourceString("/test_data/nested/nested1Results.json")
37 |
38 | val df = nestedTestCaseFactory.getTestCase
39 |
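    | // An empty struct path ("") targets the root level of the schema, so 'id_str' is
    | // added alongside the root fields it is derived from.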
40 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "", "id_str", (_, gf) =>
41 | concat(gf("id"), lit(" "), gf("key1").cast(StringType), lit(" "), gf("key2"))
42 | )
43 |
44 | val actualSchema = dfOut.schema.treeString
45 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect())
46 |
47 | assertSchema(actualSchema, expectedSchema)
48 | assertResults(actualResults, expectedResults)
49 | }
50 |
51 | test("Test extended array transformations work on an inner struct level fields") {
52 | val expectedSchema = getResourceString("/test_data/nested/nested2Schema.txt")
53 | val expectedResults = getResourceString("/test_data/nested/nested2Results.json")
54 |
55 | val df = nestedTestCaseFactory.getTestCase
56 |
57 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "struct2", "skey2", (c, gf) =>
58 | concat(gf("key1"), lit(" "), gf("struct2.inner1.key5").cast(StringType), lit(" "),
59 | c.getField("inner1").getField("key6"))
60 | ).select("key1", "struct2")
61 |
62 | val actualSchema = dfOut.schema.treeString
63 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect())
64 |
65 | assertSchema(actualSchema, expectedSchema)
66 | assertResults(actualResults, expectedResults)
67 | }
68 |
69 | test("Test extended array transformations work on a double nested inner struct level fields") {
70 | val expectedSchema = getResourceString("/test_data/nested/nested3Schema.txt")
71 | val expectedResults = getResourceString("/test_data/nested/nested3Results.json")
72 |
73 | val df = nestedTestCaseFactory.getTestCase
74 |
75 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "struct2.inner1", "skey2", (c, gf) =>
76 | concat(gf("key1"), lit(" "), gf("struct2.inner1.key5").cast(StringType), lit(" "), c.getField("key6"))
77 | ).select("key1", "struct2")
78 |
79 | val actualSchema = dfOut.schema.treeString
80 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect())
81 |
82 | assertSchema(actualSchema, expectedSchema)
83 | assertResults(actualResults, expectedResults)
84 | }
85 |
86 | test("Test extended array transformations work on a nested struct in an array") {
87 | val expectedSchema = getResourceString("/test_data/nested/nested4Schema.txt")
88 | val expectedResults = getResourceString("/test_data/nested/nested4Results.json")
89 |
90 | val df = nestedTestCaseFactory.getTestCase
91 |
92 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "array1", "skey3", (c, gf) =>
93 | concat(gf("key1"), lit(" "), gf("array1.key7").cast(StringType), lit(" "), c.getField("key8"))
94 | ).select("key1", "array1")
95 |
96 | val actualSchema = dfOut.schema.treeString
97 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect())
98 |
99 | assertSchema(actualSchema, expectedSchema)
100 | assertResults(actualResults, expectedResults)
101 | }
102 |
103 | test("Test extended array transformations work on a nested struct in an array of an array") {
104 | val expectedSchema = getResourceString("/test_data/nested/nested5Schema.txt")
105 | val expectedResults = getResourceString("/test_data/nested/nested5Results.json")
106 |
107 | val df = nestedTestCaseFactory.getTestCase
108 |
109 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "array2.inner2", "out", (c, gf) =>
110 | concat(gf("key1"),
111 | lit(" "),
112 | gf("array2.key2").cast(StringType),
113 | lit(" "),
114 | gf("array2.inner2.key9"),
115 | lit(" "),
116 | c.getField("key10"))
117 | ).select("key1", "array2")
118 |
119 | val actualSchema = dfOut.schema.treeString
120 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect())
121 |
122 | assertSchema(actualSchema, expectedSchema)
123 | assertResults(actualResults, expectedResults)
124 | }
125 |
126 | test("Test extended array transformations work if a nested struct in an array is accessed") {
127 | val expectedSchema = getResourceString("/test_data/nested/nested6Schema.txt")
128 | val expectedResults = getResourceString("/test_data/nested/nested6Results.json")
129 |
130 | val df = nestedTestCaseFactory.getTestCase
131 |
132 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "array2.inner2", "out", (c, gf) =>
133 | concat(c.getField("key10"),
134 | lit(" "),
135 | gf("array2.inner2.struct3.k1").cast(StringType))
136 | ).select("array2")
137 |
138 | val actualSchema = dfOut.schema.treeString
139 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect())
140 |
141 | assertSchema(actualSchema, expectedSchema)
142 | assertResults(actualResults, expectedResults)
143 | }
144 |
145 | test("Test extended array transformations work for a nested struct in an array is accessed") {
146 | val expectedSchema = getResourceString("/test_data/nested/nested7Schema.txt")
147 | val expectedResults = getResourceString("/test_data/nested/nested7Results.json")
148 |
149 | val df = nestedTestCaseFactory.getTestCase
150 |
151 | val dfOut = NestedArrayTransformations.nestedExtendedStructMap(df, "array2.inner2.struct3", "out", (c, gf) =>
152 | concat(c.getField("k1"),
153 | lit(" "),
154 | gf("array2.inner2.key10").cast(StringType))
155 | ).select("array2")
156 |
157 | val actualSchema = dfOut.schema.treeString
158 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect())
159 |
160 | assertSchema(actualSchema, expectedSchema)
161 | assertResults(actualResults, expectedResults)
162 | }
163 |
164 | test("Test extended array transformations with error column work if a nested struct in an array is accessed") {
165 | val expectedSchema = getResourceString("/test_data/nested/nested8Schema.txt")
166 | val expectedResults = getResourceString("/test_data/nested/nested8Results.json")
167 |
168 | val df = nestedTestCaseFactory.getTestCase
169 |
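    | // The second lambda produces an optional error: a non-null result is appended to the
    | // root-level 'errCol' array, while null results are discarded.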
170 | val dfOut = NestedArrayTransformations.nestedExtendedStructAndErrorMap(df, "array2.inner2", "out", "errCol", (c, gf) =>
171 | concat(c.getField("key10"),
172 | lit(" "),
173 | gf("array2.inner2.struct3.k1").cast(StringType))
174 | ,
175 | (_, gf) => {
176 | when(gf("array2.inner2.struct3.k1") =!= 1,
177 | callUDF("confCastErr", lit("k1!==1"), gf("array2.inner2.struct3.k1").cast(StringType))
178 | ).otherwise(null)
179 | }
180 | ).select("array2", "errCol")
181 |
182 | val actualSchema = dfOut.schema.treeString
183 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect())
184 |
185 | assertSchema(actualSchema, expectedSchema)
186 | assertResults(actualResults, expectedResults)
187 | }
188 |
189 | test("Test extended array transformations with error column that has existing errors") {
190 | val expectedSchema = getResourceString("/test_data/nested/nested9Schema.txt")
191 | val expectedResults = getResourceString("/test_data/nested/nested9Results.json")
192 |
193 | val df = nestedTestCaseFactory
194 | .getTestCase
195 | .withColumn("errCol", array(typedLit(ErrorMessage("Initial", "000", "ErrMsg", "id", Seq(), Seq()))))
196 |
197 | val dfOut = NestedArrayTransformations.nestedExtendedStructAndErrorMap(df, "array2.inner2", "out", "errCol", (c, gf) =>
198 | concat(c.getField("key10"),
199 | lit(" "),
200 | gf("array2.inner2.struct3.k1").cast(StringType)),
202 | (_, gf) => {
203 | when(gf("array2.inner2.struct3.k1") =!= 1,
204 | callUDF("confCastErr", lit("k1!==1"), gf("array2.inner2.struct3.k1").cast(StringType))
205 | ).otherwise(null)
206 | }
207 | ).select("array2", "errCol")
208 |
209 | val actualSchema = dfOut.schema.treeString
210 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect())
211 |
212 | assertSchema(actualSchema, expectedSchema)
213 | assertResults(actualResults, expectedResults)
214 | }
215 |
216 | test("Test extended array transformations with error column for an array inside a double nested struct") {
217 | val expectedSchema = getResourceString("/test_data/nested/nested10Schema.txt")
218 | val expectedResults = getResourceString("/test_data/nested/nested10Results.json")
219 |
220 | val df = nestedTestCaseFactory
221 | .getTestCase
222 | .withColumn("errCol", array(typedLit(ErrorMessage("Initial", "000", "ErrMsg", "id", Seq(), Seq()))))
223 |
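    | // Here the output column is itself a fully qualified path, so the result is placed
    | // inside the nested array ('struct3.inner3.array3.out') rather than at the root level.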
224 | val dfOut = NestedArrayTransformations.nestedExtendedStructAndErrorMap(df, "struct3.inner3.array3",
225 | "struct3.inner3.array3.out", "errCol", (c, gf) =>
226 | concat(c.getField("a1"),
227 | lit(" "),
228 | gf("struct3.inner3.array3.a2").cast(StringType)),
230 | (_, gf) => {
231 | when(gf("struct3.inner3.array3.a1") =!= 3,
232 | callUDF("confCastErr", lit("a1!==3"), gf("struct3.inner3.array3.a1").cast(StringType))
233 | ).otherwise(null)
234 | }
235 | ).select("struct3", "errCol")
236 |
237 | val actualSchema = dfOut.schema.treeString
238 | val actualResults = JsonUtils.prettySparkJSON(dfOut.orderBy("id").toJSON.collect())
239 |
240 | assertSchema(actualSchema, expectedSchema)
241 | assertResults(actualResults, expectedResults)
242 | }
243 |
244 | private def getResourceString(name: String): String =
245 | IOUtils.toString(getClass.getResourceAsStream(name), "UTF-8")
246 |
247 | private def assertSchema(actualSchema: String, expectedSchema: String): Unit = {
248 | if (actualSchema != expectedSchema) {
249 | log.error("EXPECTED:")
250 | log.error(expectedSchema)
251 | log.error("ACTUAL:")
252 | log.error(actualSchema)
253 | fail("Actual conformed schema does not match the expected schema (see above).")
254 | }
255 | }
256 |
257 | private def assertResults(actualResults: String, expectedResults: String): Unit = {
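    | // startsWith (rather than strict equality) appears to tolerate a trailing newline
    | // in the expected resource files.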
258 | if (!expectedResults.startsWith(actualResults)) {
259 | log.error("EXPECTED:")
260 | log.error(expectedResults)
261 | log.error("ACTUAL:")
262 | log.error(actualResults)
263 | fail("Actual conformed dataset JSON does not match the expected JSON (see above).")
264 | }
265 | }
266 | }
267 |
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/spark/hats/transformations/samples/DeepArraySamples.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.transformations.samples
18 |
19 | // Examples for constructing DataFrames containing arrays at various levels of nesting.
20 | // Some samples also include an error column to test transformations that can fail per field.
21 |
22 | // The case classes are declared at the package level so they can be used to create Spark Datasets.
23 | // They are declared package-private so their names won't pollute the public/exported namespace.
24 |
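    | // A minimal usage sketch (mirroring how the test suites consume these samples):
    | //
    | //   import spark.implicits._
    | //   val df = spark.sparkContext.parallelize(DeepArraySamples.plainSampleN).toDF()
    | //   df.printSchema()   // city: string, street: string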
25 | // Structs of Structs example
26 | private[transformations] case class Address(city: String, street: String)
27 |
28 | private[transformations] case class Employee(name: String, address: Address)
29 |
30 | private[transformations] case class TestObj(id: Int, employee: Employee)
31 |
32 | private[transformations] case class TestObj2(id: Int, employee: Seq[Employee])
33 |
34 | // Arrays of primitives example
35 | private[transformations] case class FunWords(id: Int, words: Seq[String])
36 |
37 | // Arrays of arrays of primitives example
38 | private[transformations] case class GeoData(id: Int, matrix: Seq[Seq[String]])
39 |
40 | // Arrays of structs example
41 | private[transformations] case class Person(firstName: String, lastName: String)
42 |
43 | private[transformations] case class Team(id: Int, person: Seq[Person])
44 |
45 | private[transformations] case class Dept(name: String, team: Team)
46 |
47 | // Arrays of Arrays of struct
48 | private[transformations] case class Tournament(id: Int, person: Seq[Seq[Person]])
49 |
50 | // Arrays of structs in arrays of structs
51 | private[transformations] case class Condition(conif: String, conthen: String, amount: Double)
52 |
53 | private[transformations] case class Leg(legid: Int, conditions: Seq[Condition])
54 |
55 | private[transformations] case class Trade(id: Int, legs: Seq[Leg])
56 |
57 | // Structs of Structs example with error column
58 | private[transformations] case class AddressWithErrColumn(city: String, street: String, buildingNum: Int, zip: String,
59 | errors: Seq[ErrorMessage])
60 |
61 | private[transformations] case class AddressNoErrColumn(city: String, street: String, buildingNum: Int, zip: String)
62 |
63 | private[transformations] case class EmployeeNoErrorColumn(name: String, address: AddressNoErrColumn)
64 |
65 | private[transformations] case class TestObj1WithErrorColumn(id: Int, employee: EmployeeNoErrorColumn, errors:
66 | Seq[ErrorMessage])
67 |
68 | private[transformations] case class TestObj2WithErrorColumn(id: Int, employee: Seq[EmployeeNoErrorColumn],
69 | errors: Seq[ErrorMessage])
70 |
71 | private[transformations] case class TestObj2NoErrColumn(id: Int, employee: Seq[EmployeeNoErrorColumn])
72 |
73 | // Arrays of primitives example with error column
74 | private[transformations] case class FunNumbersWithErrorColumn(id: Int, nums: Seq[String], errors: Seq[ErrorMessage])
75 |
76 | // Arrays of arrays of primitives example with error column
77 | private[transformations] case class GeoDataWithErrorColumn(id: Int, matrix: Seq[Seq[String]], errors: Seq[ErrorMessage])
78 |
79 | // Arrays of structs in arrays of structs with error column
80 | private[transformations] case class TradeWithErrorColumn(id: Int, legs: Seq[Leg], errors: Seq[ErrorMessage])
81 |
82 | object DeepArraySamples {
83 | // scalastyle:off magic.number
84 | // scalastyle:off line.size.limit
85 |
86 | // WITHOUT error column
87 |
88 | // Plain
89 | val plainSampleN: Seq[Address] = Seq(
90 | Address("Olomuc", "Vodickova"),
91 | Address("Ostrava", "Vlavska"),
92 | Address("Plzen", "Kralova")
93 | )
94 |
95 | // Struct of struct
96 | val structOfStructSampleN: Seq[TestObj] = Seq(
97 | TestObj(1, Employee("Martin", Address("Olomuc", "Vodickova"))),
98 | TestObj(1, Employee("Petr", Address("Ostrava", "Vlavska"))),
99 | TestObj(1, Employee("Vojta", Address("Plzen", "Kralova")))
100 | )
101 |
102 | // Array of struct of struct
103 | val arrayOfstructOfStructSampleN: Seq[TestObj2] = Seq(
104 | TestObj2(1, Seq(Employee("Martin", Address("Olomuc", "Vodickova")), Employee("Stephan", Address("Olomuc", "Vodickova")))),
105 | TestObj2(2, Seq(Employee("Petr", Address("Ostrava", "Vlavska")), Employee("Michal", Address("Ostrava", "Vlavska")))),
106 | TestObj2(3, Seq(Employee("Vojta", Address("Plzen", "Kralova"))))
107 | )
108 |
109 | // Arrays of primitives
110 | val arraysOfPrimitivesSampleN: Seq[FunWords] = Seq(
111 | FunWords(1, Seq("Gizmo", "Blurp", "Buzinga")),
112 | FunWords(1, Seq("Quirk", "Zap", "Mmrnmhrm"))
113 | )
114 |
115 | // Arrays of arrays of primitives
116 | val arraysOfArraysOfPrimitivesSampleN: Seq[GeoData] = Seq(
117 | GeoData(1, Seq(Seq("Tree", "Table"), Seq("Map", "Duck"))),
118 | GeoData(2, Seq(Seq("Apple", "Machine"), Seq("List", "Duck"))),
119 | GeoData(3, Seq(Seq("Computer", "Snake"), Seq("Sun", "Star")))
120 | )
121 |
122 | // Arrays of structs
123 | val arraysOfStructsSampleN: Seq[Team] = Seq(
124 | Team(1, Seq(Person("John", "Smith"), Person("Jack", "Brown"))),
125 | Team(1, Seq(Person("Merry", "Cook"), Person("Jane", "Clark")))
126 | )
127 |
128 | // Arrays of arrays of struct
129 | val arraysOfArraysOfStructSampleN: Seq[Tournament] = Seq(
130 | Tournament(1, Seq(Seq(Person("Mona Lisa", "Harddrive")), Seq(Person("Lenny", "Linux"), Person("Dot", "Not")))),
131 | Tournament(1, Seq(Seq(Person("Eddie", "Larrison")), Seq(Person("Scarlett", "Johanson"), Person("William", "Windows"))))
132 | )
133 |
134 | // Arrays of struct with arrays of struct
135 | val arraysOfStrtuctsDeepSampleN: Seq[Trade] = Seq(
136 | Trade(1, Seq(
137 | Leg(100, Seq(
138 | Condition("if bid>10", "buy", 100), Condition("if sell<5", "sell", 150), Condition("if sell<1", "sell", 1000))),
139 | Leg(101, Seq(
140 | Condition("if bid<50", "sell", 200), Condition("if sell>30", "buy", 175), Condition("if sell>25", "buy", 225)))
141 | )),
142 | Trade(2, Seq(
143 | Leg(102, Seq(
144 | Condition("if bid>11", "buy", 100), Condition("if sell<6", "sell", 150), Condition("if sell<2", "sell", 1000))),
145 | Leg(103, Seq(
146 | Condition("if bid<51", "sell", 200), Condition("if sell>31", "buy", 175), Condition("if sell>26", "buy", 225)))
147 | )),
148 | Trade(3, Seq(
149 | Leg(104, Seq(
150 | Condition("if bid>12", "buy", 100), Condition("if sell<7", "sell", 150), Condition("if sell<3", "sell", 1000))),
151 | Leg(105, Seq(
152 | Condition("if bid<52", "sell", 200), Condition("if sell>32", "buy", 175), Condition("if sell>27", "buy", 225)))
153 | ))
154 | )
155 |
156 | // WITH error column
157 |
158 | // Plain
159 | val plainSampleE: Seq[AddressWithErrColumn] = Seq(
160 | AddressWithErrColumn("Olomuc", "Vodickova", 12, "12000", Seq()),
161 | AddressWithErrColumn("Ostrava", "Vlavska", 110, "1455a", Seq()),
162 | AddressWithErrColumn("Plzen", "Kralova", 71, "b881",
163 | Seq(ErrorMessage("myErrorType", "E-1", "Testing This stuff", "whatEvColumn", Seq("some value"))))
164 | )
165 |
166 | // Struct of struct
167 | val structOfStructSampleE: Seq[TestObj1WithErrorColumn] = Seq(
168 | TestObj1WithErrorColumn(1, EmployeeNoErrorColumn("Martin", AddressNoErrColumn("Olomuc", "Vodickova", 12, "12000")), Nil),
169 | TestObj1WithErrorColumn(1, EmployeeNoErrorColumn("Petr", AddressNoErrColumn("Ostrava", "Vlavska", 110, "1455a")),
170 | Seq(ErrorMessage("myErrorType", "E-1", "Testing This stuff", "whatEvColumn", Seq("some value")))),
171 | TestObj1WithErrorColumn(1, EmployeeNoErrorColumn("Vojta", AddressNoErrColumn("Plzen", "Kralova", 71, "b881")), Nil)
172 | )
173 |
174 | // Array of struct of struct
175 | val arrayOfStructOfStructErrSampleE: Seq[TestObj2WithErrorColumn] = Seq(
176 | TestObj2WithErrorColumn(1, Seq(
177 | EmployeeNoErrorColumn("Martin", AddressNoErrColumn("Olomuc", "Vodickova", 732, "73200")),
178 | EmployeeNoErrorColumn("Stephan", AddressNoErrColumn("Olomuc", "Vodickova", 77, "77-333"))), Nil),
179 | TestObj2WithErrorColumn(2, Seq(
180 | EmployeeNoErrorColumn("Petr", AddressNoErrColumn("Ostrava", "Vlavska", 25, "a9991")),
181 | EmployeeNoErrorColumn("Michal", AddressNoErrColumn("Ostrava", "Vlavska", 334, "552-aa1"))),
182 | Seq(ErrorMessage("myErrorType", "E-1", "Testing This stuff", "whatEvColumn", Seq("some value")))),
183 | TestObj2WithErrorColumn(3, Seq(
184 | EmployeeNoErrorColumn("Vojta", AddressNoErrColumn("Plzen", "Kralova", 33, "993"))), Nil)
185 | )
186 |
187 | val arrayOfStructOfStructNoErrSampleE: Seq[TestObj2NoErrColumn] = Seq(
188 | TestObj2NoErrColumn(1, Seq(
189 | EmployeeNoErrorColumn("Martin", AddressNoErrColumn("Olomuc", "Vodickova", 732, "73200")),
190 | EmployeeNoErrorColumn("Stephan", AddressNoErrColumn("Olomuc", "Vodickova", 77, "77-333")))),
191 | TestObj2NoErrColumn(2, Seq(
192 | EmployeeNoErrorColumn("Petr", AddressNoErrColumn("Ostrava", "Vlavska", 25, "a9991")),
193 | EmployeeNoErrorColumn("Michal", AddressNoErrColumn("Ostrava", "Vlavska", 334, "552-aa1")))),
194 | TestObj2NoErrColumn(3, Seq(
195 | EmployeeNoErrorColumn("Vojta", AddressNoErrColumn("Plzen", "Kralova", 33, "993"))))
196 | )
197 |
198 | // Arrays of primitives
199 | val arraysOfPrimitivesSampleE: Seq[FunNumbersWithErrorColumn] = Seq(
200 | FunNumbersWithErrorColumn(1, Seq("7755", "a212", "222-111"),
201 | Seq(ErrorMessage("myErrorType", "E-1", "Testing This stuff", "whatEvColumn", Seq("some value")))),
202 | FunNumbersWithErrorColumn(1, Seq("223a", "223a", "775"), Nil),
203 | FunNumbersWithErrorColumn(1, Seq("5", "-100", "9999999"), Nil)
204 | )
205 |
206 | // Arrays of arrays of primitives
207 | val arraysOfArraysOfPrimitivesSampleE: Seq[GeoDataWithErrorColumn] = Seq(
208 | GeoDataWithErrorColumn(1, Seq(Seq("10", "11b"), Seq("11b", "12")),
209 | Seq(ErrorMessage("myErrorType", "E-1", "Testing This stuff", "whatEvColumn", Seq("some value")))),
210 | GeoDataWithErrorColumn(2, Seq(Seq("20f", "300"), Seq("1000", "10-10")), Nil),
211 | GeoDataWithErrorColumn(3, Seq(Seq("775", "223"), Seq("100", "0")), Nil)
212 | )
213 |
214 | // Arrays of structs with arrays of structs
215 | val arraysOfStrtuctsDeepSampleE: Seq[TradeWithErrorColumn] = Seq(
216 | TradeWithErrorColumn(1, Seq(
217 | Leg(100, Seq(
218 | Condition("if bid>10", "100", 100), Condition("if sell<5", "300a", 150), Condition("if sell<1", "1000", 1000))),
219 | Leg(101, Seq(
220 | Condition("if bid<50", "200", 200), Condition("if sell>30", "175b", 175), Condition("if sell>25", "225-225", 225)))
221 | ), Nil),
222 | TradeWithErrorColumn(2, Seq(
223 | Leg(102, Seq(
224 | Condition("if bid>11", "100", 100), Condition("if sell<6", "150", 150), Condition("if sell<2", "1000", 1000))),
225 | Leg(103, Seq(
226 | Condition("if bid<51", "200", 200), Condition("if sell>31", "175", 175), Condition("if sell>26", "225", 225)))
227 | ), Nil),
228 | TradeWithErrorColumn(3, Seq(
229 | Leg(104, Seq(
230 | Condition("if bid>12", "1OO", 100), Condition("if sell<7", "150x", 150), Condition("if sell<3", "-1000-", 1000))),
231 | Leg(105, Seq(
232 | Condition("if bid<52", "2OO", 200), Condition("if sell>32", "f175", 175), Condition("if sell>27", "225_", 225)))
233 | ), Seq(ErrorMessage("myErrorType", "E-1", "Testing This stuff", "whatEvColumn", Seq("some value"))))
234 | )
235 | }
236 |
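237 | // A minimal usage sketch (illustrative only, not part of the original fixtures):
238 | // these sample sequences are typically materialized as DataFrames in tests,
239 | // assuming a SparkSession named `spark` with its implicits in scope:
240 | //
241 | //   import spark.implicits._
242 | //   val df = plainSampleE.toDF()   // Seq[AddressWithErrColumn] -> DataFrame
243 | //   df.printSchema()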
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/spark/hats/transformations/samples/ErrorMessage.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.transformations.samples
18 |
19 | /**
20 | * Case class to represent an error message
21 | *
22 | * @param errType - Type or source of the error
23 | * @param errCode - Internal error code
24 | * @param errMsg - Textual description of the error
25 | * @param errCol - The name of the column where the error occurred
26 | * @param rawValues - Sequence of raw values (which are the potential culprits of the error)
27 |  * @param mappings - Sequence of Mapping entries, i.e. mapping table column -> equivalent mapped dataset column
28 | */
29 | case class ErrorMessage(errType: String, errCode: String, errMsg: String, errCol: String,
30 |                         rawValues: Seq[String], mappings: Seq[Mapping] = Seq())
31 | case class Mapping(mappingTableColumn: String, mappedDatasetColumn: String)
32 |
33 | object ErrorMessage {
34 |   val errorColumnName = "errCol"
35 |
36 |   /** Builds the standard error entry for a casting conformance rule that returned null. */
37 |   def confCastErr(errCol: String, rawValue: String): ErrorMessage = ErrorMessage(
38 |     errType = "confCastError",
39 |     errCode = "E00003",
40 |     errMsg = "Conformance Error - Null returned by casting conformance rule",
41 |     errCol = errCol,
42 |     rawValues = Seq(rawValue))
43 |
44 | }
45 |
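46 | // A minimal usage sketch (illustrative only, not part of the original sources):
47 | //
48 | //   val err = ErrorMessage.confCastErr(errCol = "price", rawValue = "12a")
49 | //   // err == ErrorMessage("confCastError", "E00003",
50 | //   //   "Conformance Error - Null returned by casting conformance rule",
51 | //   //   "price", Seq("12a"))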
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/spark/hats/transformations/samples/NestedMapTestCaseFactory.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.transformations.samples
18 |
19 | import org.apache.spark.sql.types._
20 | import org.apache.spark.sql.{DataFrame, Row, SparkSession}
21 |
22 | class NestedMapTestCaseFactory(implicit spark: SparkSession) {
23 |
24 | private val testCaseSchema = new StructType()
25 | .add("name", StringType)
26 | .add("addresses", ArrayType(new StructType()
27 | .add("city", StringType)
28 | .add("state", StringType)))
29 | .add("properties", MapType(StringType, StringType))
30 |
31 | private val testCaseData = Seq(
32 | Row("John", List(Row("Newark", "NY"), Row("Brooklyn", "NY")), Map("hair" -> "black", "eyes" -> "brown", "height" -> "178")),
33 | Row("Kate", List(Row("San Jose", "CA"), Row("Sandiago", "CA")), Map("hair" -> "brown", "eyes" -> "black", "height" -> "165")),
34 | Row("William", List(Row("Las Vegas", "NV")), Map("hair" -> "red", "eye" -> "gray", "height" -> "185")),
35 | Row("Sarah", null, Map("hair" -> "blond", "eyes" -> "red", "height" -> "162")),
36 | Row("Michael", List(Row("Sacramento", "CA"), Row("San Diego", "CA")), Map("white" -> "black", "eyes" -> "black", "height" -> "180"))
37 | )
38 |
39 | def getTestCase: DataFrame = {
40 | spark.createDataFrame(
41 | spark.sparkContext.parallelize(testCaseData),
42 | testCaseSchema
43 | ).orderBy("name")
44 | }
45 |
46 | }
47 |
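48 | // NestedMapTestCaseFactory builds a fixed five-row DataFrame with a plain
49 | // string column, an array-of-struct column and a map<string,string> column.
50 | // A minimal usage sketch (illustrative only), assuming an implicit SparkSession:
51 | //
52 | //   val df = new NestedMapTestCaseFactory().getTestCase
53 | //   df.printSchema()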
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/spark/hats/transformations/samples/NestedTestCaseFactory.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.transformations.samples
18 |
19 | import org.apache.spark.sql.types._
20 | import org.apache.spark.sql.{DataFrame, SparkSession, types}
21 |
22 | class NestedTestCaseFactory(implicit spark: SparkSession) {
23 |
24 | private val testCaseSchema = StructType(
25 | Array(
26 | StructField("id", LongType),
27 | StructField("key1", LongType),
28 | StructField("key2", LongType),
29 | StructField("struct1", StructType(Array(
30 | StructField("key3", IntegerType),
31 | StructField("key4", IntegerType)
32 | ))),
33 | StructField("struct2", StructType(Array(
34 | StructField("inner1", StructType(Array(
35 | StructField("key5", LongType),
36 | StructField("key6", LongType),
37 | StructField("skey1", StringType)
38 | )))
39 | ))),
40 | StructField("struct3", StructType(Array(
41 | StructField("inner3", StructType(Array(
42 | StructField("array3", types.ArrayType(StructType(Array(
43 | StructField("a1", LongType),
44 | StructField("a2", LongType),
45 | StructField("a3", StringType)
46 | ))))
47 | )))
48 | ))),
49 | StructField("array1", types.ArrayType(StructType(Array(
50 | StructField("key7", LongType),
51 | StructField("key8", LongType),
52 | StructField("skey2", StringType)
53 | )))),
54 | StructField("array2", types.ArrayType(StructType(Array(
55 | StructField("key2", LongType),
56 | StructField("inner2", types.ArrayType(StructType(Array(
57 | StructField("key9", LongType),
58 | StructField("key10", LongType),
59 | StructField("struct3", StructType(Array(
60 | StructField("k1", IntegerType),
61 | StructField("k2", IntegerType)
62 | )))
63 | ))))
64 | ))))
65 | ))
66 |
67 | def getTestCase: DataFrame = {
68 | spark.read
69 | .schema(testCaseSchema)
70 | .json(getClass.getResource("/test_data/nested/nestedDf1.json").getPath)
71 | }
72 |
73 | }
74 |
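75 | // NestedTestCaseFactory loads a deeply nested test DataFrame (structs inside
76 | // structs, arrays of structs, and arrays nested in arrays) from the
77 | // nestedDf1.json test resource. A minimal usage sketch (illustrative only),
78 | // assuming an implicit SparkSession and the resource on the classpath:
79 | //
80 | //   val df = new NestedTestCaseFactory().getTestCase
81 | //   df.select("id", "struct3.inner3.array3").show(truncate = false)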
--------------------------------------------------------------------------------
/src/test/scala/za/co/absa/spark/hats/transformations/samples/SampleErrorUDFs.scala:
--------------------------------------------------------------------------------
1 | /*
2 | * Copyright 2020 ABSA Group Limited
3 | *
4 | * Licensed under the Apache License, Version 2.0 (the "License");
5 | * you may not use this file except in compliance with the License.
6 | * You may obtain a copy of the License at
7 | *
8 | * http://www.apache.org/licenses/LICENSE-2.0
9 | *
10 | * Unless required by applicable law or agreed to in writing, software
11 | * distributed under the License is distributed on an "AS IS" BASIS,
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 | * See the License for the specific language governing permissions and
14 | * limitations under the License.
15 | */
16 |
17 | package za.co.absa.spark.hats.transformations.samples
18 |
19 | import org.apache.spark.sql.SparkSession
20 |
21 | import scala.collection.mutable
22 |
23 | case class SampleErrorUDFs()(implicit val spark: SparkSession) {
24 |
25 | spark.udf.register("confCastErr", { (errCol: String, rawValue: String) =>
26 | ErrorMessage.confCastErr(errCol, rawValue)
27 | })
28 |
29 | spark.udf.register("arrayDistinctErrors",
30 | (arr: mutable.WrappedArray[ErrorMessage]) =>
31 | if (arr != null) {
32 | arr.distinct.filter((a: AnyRef) => a != null)
33 | } else {
34 | Seq[ErrorMessage]()
35 | }
36 | )
37 |
38 | }
39 |
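40 | // Constructing SampleErrorUDFs registers two UDFs with the session as a side
41 | // effect: confCastErr (wraps a failed cast in an ErrorMessage) and
42 | // arrayDistinctErrors (deduplicates an error array and drops nulls).
43 | // A minimal usage sketch (illustrative only), assuming an implicit SparkSession:
44 | //
45 | //   SampleErrorUDFs()
46 | //   spark.sql("SELECT confCastErr('price', '12a') AS err").show(truncate = false)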
--------------------------------------------------------------------------------
/version.sbt:
--------------------------------------------------------------------------------
1 | ThisBuild / version := "0.3.1-SNAPSHOT"
2 |
--------------------------------------------------------------------------------