├── .coveragerc ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── conf ├── data-quality │ ├── example-dq-report.html │ └── rules │ │ ├── production_configs │ │ ├── recipe-task1-dq-rules.json │ │ └── recipe-task2-dq-rules.json │ │ └── unit_test_configs │ │ ├── recipe-task1-dq-rules.json │ │ └── recipe-task2-dq-rules.json ├── python │ └── logging-properties.json └── spark │ ├── log4j.properties │ └── sparkConf.conf ├── docs ├── APIDOC.MD ├── ETL_README.md ├── PysparkLocalSetup.docx ├── SETUP.MD ├── apidocumentation.html ├── images │ ├── DataQualityUML.png │ ├── XMLParse.png │ ├── dq-task1.png │ ├── dq-task2.png │ ├── task1_ouput_er.png │ └── task2_ouput_er.png └── setup.html ├── logs ├── bash │ └── logs └── python │ └── log-sample ├── requirements.txt ├── resources ├── data-quality-reports │ └── recipe-tasks │ │ ├── task1-dq-report.html │ │ └── task2-dq-report.html └── data │ ├── clinical_trial │ ├── data │ │ └── chunk1.zip │ ├── job_parameters │ │ └── clinical_trial.json │ ├── sql │ │ └── transformations │ │ │ └── sponsors.sql │ └── xml │ │ ├── clinical_study_xsd.xsd │ │ └── default_clinical_study.xml │ ├── config │ ├── application_properties.json │ ├── application_properties.yaml │ └── logging.yaml │ ├── product.csv │ ├── purchase.csv │ ├── recipes │ ├── input │ │ ├── recipes-000.json │ │ ├── recipes-001.json │ │ └── recipes-002.json │ └── output │ │ ├── task1 │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet │ │ └── task2 │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv.crc │ │ ├── _SUCCESS │ │ └── part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv │ └── store.csv ├── sbin ├── common_functions.sh ├── create_python_venv.sh └── execute-tasks-spark-submit.sh ├── setup.py ├── src └── com │ ├── __init__.py │ └── vitthalmirji │ ├── __init__.py │ ├── datapipelines │ ├── __init__.py │ ├── clinical_trial │ │ ├── __init__.py │ │ └── clinical_trial_etl.py │ └── recipe_tasks.py │ ├── datawarehousing │ ├── __init__.py │ └── change_data_capture.py │ ├── etl │ ├── CColumn.py │ ├── ETL.py │ ├── ETLTransform.py │ ├── ITable.py │ ├── __init__.py │ └── meta │ │ ├── MetaModel.py │ │ └── __init__.py │ ├── imports │ ├── HdfsImport.py │ └── __init__.py │ ├── kafka │ ├── Logger.py │ └── __init__.py │ ├── main.py │ ├── mapper │ ├── Mapper.py │ └── __init__.py │ ├── objects │ ├── __init__.py │ └── enums │ │ ├── Environments.py │ │ ├── Zones.py │ │ └── __init__.py │ └── utils │ ├── MockupData.py │ ├── Utilities.py │ ├── __init__.py │ ├── audit_util.py │ ├── comprehensive_logging.py │ ├── constants.py │ ├── data_quality.py │ ├── helpers.py │ ├── logging_util.py │ ├── spark.py │ ├── spark_submit_utils.py │ └── transformation_extension.py └── tests ├── EtlTransformTest.py ├── UtilsTest.py ├── XmlMapperTest.py ├── aws_test ├── AwsS3Test.py ├── __init__.py ├── glue_job.py ├── test_glue_job.py ├── test_mocked_postgres.py ├── test_mocked_redshift.py ├── test_mocked_redshift_infra.py └── testing_mocked_s3.py ├── resources ├── config.yml ├── datamodel.csv ├── meta.csv ├── mock_dataframe.txt ├── product.csv ├── purchase.csv └── store.csv ├── test_comprehensive_logging.py ├── test_data_quality.py ├── test_helpers.py ├── test_logging_util.py ├── test_recipe_tasks.py ├── test_spark.py ├── test_spark_submit_execution_pool.py └── test_spark_submit_utils.py /.coveragerc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/.coveragerc -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | 3 | # Bash script logs 4 | *.log 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | src/test/metastore_db 32 | src/src/main/test/hive 33 | src/test/spark-warehouse 34 | src/test/derby.log 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # PyBuilder 64 | target/ 65 | 66 | # pyenv 67 | .python-version 68 | 69 | # Environments 70 | .env 71 | .venv 72 | env/ 73 | venv/ 74 | ENV/ 75 | env.bak/ 76 | venv.bak/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include sbin *.sh 2 | recursive-include conf *.html *.json *.conf *.properties *.html 3 | recursive-include resources *.html 4 | recursive-include logs logs log-sample 5 | recursive-include docs *.md 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Datalake ETL Pipeline 2 | Data transformation simplified for any Data platform. 3 | 4 | `Features:` The package has complete ETL process - 5 | 1. Uses metadata, transformation & data model information to design ETL pipeline 6 | 2. Builds target transformation SparkSQL and Spark Dataframes 7 | 3. Builds source & target Hive DDLs 8 | 4. Validates DataFrames, extends core classes, defines DataFrame transformations, and provides UDF SQL functions. 9 | 5. Supports below fundamental transformations for ETL pipeline - 10 | * Filters on source & target dataframes 11 | * Grouping and Aggregations on source & target dataframes 12 | * Heavily nested queries / dataframes 13 | 6. Has complex and heavily nested XML, JSON, Parquet & ORC parser to nth 14 | level of nesting 15 | 7. Has Unit test cases designed on function/method level & measures 16 | source code coverage 17 | 8. Has information about delpoying to higher environments 18 | 9. Has API documentation for customization & enhancement 19 | 20 | `Enhancements:` In progress - 21 | 1. Integrate Audit and logging - Define Error codes, log process 22 | failures, Audit progress & runtime information -------------------------------------------------------------------------------- /conf/data-quality/example-dq-report.html: -------------------------------------------------------------------------------- 1 |

Team,

Data Quality check finished successfully for DQ ID = 101, with failures. Check details in below table of metrics.

Failed DQ details

Yarn Application Id DQ ID Rule ID Rule Name Rule type Description Columns/Query Pass Count Fail Count Total Count
local-1681916910001 101 1011 Primary / Natural Keys unique Primary / Natural Keys should not have duplicates ['name'] 1039 3 1042
local-1681916910001 101 1012 NOT NULL fields not null Field should have valid value ['name', 'cookTime', 'prepTime'] 715 327 1042

Succeeded DQ details

Yarn Application Id DQ ID Rule ID Rule Name Rule type Description Columns/Query Pass Count Fail Count Total Count
local-1681916910001 101 1013 File names check query Check If all input files are read for processing ["WITH file_names AS (SELECT 'recipes-000.json' AS file_name UNION SELECT 'recipes-001.json' AS file_name UNION SELECT 'recipes-002.json' AS file_name)\nSELECT f.file_name FROM file_names f\nLEFT JOIN (SELECT DISTINCT reverse(split(input_file_name(), '/'))[0] as file_name FROM temp) t\nON t.file_name = f.file_name\nWHERE t.file_name IS NULL"] 1042 0 1042


Thanks
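
The rule configuration files that follow drive reports like the one above: each rule is typed as `unique`, `not null`, or `query`, and the framework reports pass/fail/total row counts per rule. As a rough, hedged sketch (not the repository's actual `src/com/vitthalmirji/utils/data_quality.py`; the helper names here are hypothetical), the first two rule types could be evaluated with PySpark roughly like this:

```python
# Illustrative only: shows how "unique" and "not null" rules could yield the
# Pass/Fail/Total counts seen in the report above. Function names are hypothetical.
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def check_unique(df: DataFrame, columns: list) -> dict:
    """Count rows whose key combination occurs more than once as failures."""
    total = df.count()
    dupes = (df.groupBy(*columns).count()                       # occurrences per key
               .filter(F.col("count") > 1)                      # keep duplicated keys
               .agg(F.coalesce(F.sum("count"), F.lit(0)).alias("dupes"))
               .collect()[0]["dupes"])
    return {"pass_count": total - dupes, "fail_count": dupes, "total_count": total}


def check_not_null(df: DataFrame, columns: list) -> dict:
    """Count rows having NULL in any of the listed columns as failures."""
    total = df.count()
    condition = None
    for c in columns:                                            # OR the null checks together
        condition = F.col(c).isNull() if condition is None else (condition | F.col(c).isNull())
    failed = df.filter(condition).count()
    return {"pass_count": total - failed, "fail_count": failed, "total_count": total}
```

A `query` rule (such as rule 1013 above) can then be handled by running the configured SQL against the registered temp view and treating every returned row as a failure; zero rows returned means the check passed.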
-------------------------------------------------------------------------------- /conf/data-quality/rules/production_configs/recipe-task1-dq-rules.json: -------------------------------------------------------------------------------- 1 | { 2 | "dq_id": 101, 3 | "rules": [ 4 | { 5 | "rule_id": 1011, 6 | "name": "Primary / Natural Keys", 7 | "description": "Primary / Natural Keys should not have duplicates", 8 | "rule_type": "unique", 9 | "columns": [ 10 | "name" 11 | ] 12 | }, 13 | { 14 | "rule_id": 1012, 15 | "name": "NOT NULL fields", 16 | "description": "Field should have valid value", 17 | "rule_type": "not null", 18 | "columns": [ 19 | "name", 20 | "cookTime", 21 | "prepTime" 22 | ] 23 | }, 24 | { 25 | "rule_id": 1013, 26 | "name": "Input files check", 27 | "description": "Check If all input files are read for processing", 28 | "rule_type": "query", 29 | "query": "WITH file_names AS (SELECT 'recipes-000.json' AS file_name UNION SELECT 'recipes-001.json' AS file_name UNION SELECT 'recipes-002.json' AS file_name)\nSELECT f.file_name FROM file_names f\nLEFT JOIN (SELECT DISTINCT reverse(split(input_file_name(), '/'))[0] as file_name FROM temp) t\nON t.file_name = f.file_name\nWHERE t.file_name IS NULL" 30 | }, 31 | { 32 | "rule_id": 1014, 33 | "name": "\"Check for invalid cook & prep time", 34 | "description": "Check empty or null values", 35 | "rule_type": "query", 36 | "query": "SELECT * FROM temp WHERE cookTime = '' OR prepTime = ''" 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /conf/data-quality/rules/production_configs/recipe-task2-dq-rules.json: -------------------------------------------------------------------------------- 1 | { 2 | "dq_id": 101, 3 | "rules": [ 4 | { 5 | "rule_id": 1015, 6 | "name": "Primary / Natural Keys", 7 | "description": "Primary / Natural Keys should not have duplicates", 8 | "rule_type": "unique", 9 | "columns": [ 10 | "difficulty" 11 | ] 12 | }, 13 | { 14 | "rule_id": 1016, 15 | "name": "NOT NULL fields", 16 | "description": "Field should have valid value", 17 | "rule_type": "not null", 18 | "columns": [ 19 | "difficulty", 20 | "avg_total_cooking_time" 21 | ] 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /conf/data-quality/rules/unit_test_configs/recipe-task1-dq-rules.json: -------------------------------------------------------------------------------- 1 | { 2 | "dq_id": 101, 3 | "execution_reports_dir": "/resources/data-quality-reports/recipe-tasks", 4 | "email_execution_report_to": "vitthalmirji@gmail.com", 5 | "rules": [ 6 | { 7 | "rule_id": 1011, 8 | "name": "Primary / Natural Keys", 9 | "description": "Primary / Natural Keys should not have duplicates", 10 | "rule_type": "unique", 11 | "columns": [ 12 | "name" 13 | ] 14 | }, 15 | { 16 | "rule_id": 1012, 17 | "name": "NOT NULL fields", 18 | "description": "Field should have valid value", 19 | "rule_type": "not null", 20 | "columns": [ 21 | "name", 22 | "cookTime", 23 | "prepTime" 24 | ] 25 | }, 26 | { 27 | "rule_id": 1013, 28 | "name": "Input files check", 29 | "description": "Check If all input files are read for processing", 30 | "rule_type": "query", 31 | "query": "WITH file_names AS (SELECT 'recipes-000.json' AS file_name UNION SELECT 'recipes-001.json' AS file_name UNION SELECT 'recipes-002.json' AS file_name)\nSELECT f.file_name FROM file_names f\nLEFT JOIN (SELECT DISTINCT reverse(split(input_file_name(), '/'))[0] as file_name FROM temp) t\nON t.file_name = 
f.file_name\nWHERE t.file_name IS NULL" 32 | }, 33 | { 34 | "rule_id": 1014, 35 | "name": "\"Check for invalid cook & prep time", 36 | "description": "Check empty or null values", 37 | "rule_type": "query", 38 | "query": "SELECT * FROM temp WHERE cookTime = '' OR prepTime = ''" 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /conf/data-quality/rules/unit_test_configs/recipe-task2-dq-rules.json: -------------------------------------------------------------------------------- 1 | { 2 | "dq_id": 101, 3 | "execution_reports_dir": "/resources/data-quality-reports/recipe-tasks", 4 | "email_execution_report_to": "vitthalmirji@gmail.com", 5 | "rules": [ 6 | { 7 | "rule_id": 1015, 8 | "name": "Primary / Natural Keys", 9 | "description": "Primary / Natural Keys should not have duplicates", 10 | "rule_type": "unique", 11 | "columns": [ 12 | "difficulty" 13 | ] 14 | }, 15 | { 16 | "rule_id": 1016, 17 | "name": "NOT NULL fields", 18 | "description": "Field should have valid value", 19 | "rule_type": "not null", 20 | "columns": [ 21 | "difficulty", 22 | "avg_total_cooking_time" 23 | ] 24 | } 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /conf/python/logging-properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "objects": { 4 | "queue": { 5 | "class": "queue.Queue", 6 | "maxsize": 1000 7 | } 8 | }, 9 | "formatters": { 10 | "simple": { 11 | "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 12 | }, 13 | "detailed": { 14 | "format": "%(asctime)s %(name)-15s %(levelname)-8s %(process)-10d %(funcName)-30s %(message)s" 15 | } 16 | }, 17 | "handlers": { 18 | "console": { 19 | "class": "logging.StreamHandler", 20 | "level": "DEBUG", 21 | "formatter": "detailed", 22 | "stream": "ext://sys.stdout" 23 | }, 24 | "file": { 25 | "class": "logging.FileHandler", 26 | "level": "DEBUG", 27 | "encoding": "utf-8", 28 | "formatter": "detailed", 29 | "filename": "logs/log-{job_name_placeholder}_{timestamp_placeholder}.log", 30 | "mode": "a" 31 | } 32 | }, 33 | "loggers": { 34 | "simple": { 35 | "level": "INFO", 36 | "handlers": [ 37 | "console" 38 | ], 39 | "propagate": "no" 40 | }, 41 | "unit-tests": { 42 | "level": "DEBUG", 43 | "handlers": [ 44 | "console" 45 | ], 46 | "propagate": "no" 47 | } 48 | }, 49 | "root": { 50 | "level": "DEBUG", 51 | "handlers": [ 52 | "console", 53 | "file" 54 | ] 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /conf/spark/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | #Global logging 3 | log4j.rootCategory=WARN, console 4 | log4j.appender.console=org.apache.log4j.ConsoleAppender 5 | log4j.appender.console.target=System.err 6 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.console.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 8 | 9 | # Spark 3.x 10 | log4j.logger.org.sparkproject.jetty.server.handler.ContextHandler=WARN 11 | 12 | # Spark 2.x 13 | log4j.logger.org.spark_project.jetty.server.handler.ContextHandler=WARN 14 | 15 | # Send WARN or higher to stderr 16 | log4j.appender.stderr=org.apache.log4j.ConsoleAppender 17 | log4j.appender.stderr.Threshold=WARN 18 | log4j.appender.stderr.Target=System.err 19 | log4j.appender.stderr.layout=org.apache.log4j.PatternLayout 20 | 
log4j.appender.stderr.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 21 | 22 | # Parquet related logging 23 | log4j.logger.parquet.name = org.apache.parquet.CorruptStatistics 24 | log4j.logger.parquet.level = WARN 25 | log4j.logger.parquet2.name = parquet.CorruptStatistics 26 | log4j.logger.parquet2.level = WARN 27 | 28 | # Hive metastore related logging 29 | logger.metastore.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler 30 | logger.metastore.level = FATAL 31 | logger.hive_functionregistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry 32 | logger.hive_functionregistry.level = ERROR 33 | 34 | # Settings to quiet third party logs that are too verbose 35 | log4j.logger.org.eclipse.jetty=WARN 36 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 37 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 38 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 39 | 40 | # Reduce verbosity for other spammy core classes. 41 | log4j.logger.org.apache.spark=WARN 42 | log4j.logger.org.apache.spark.util=ERROR 43 | log4j.logger.org.apache.spark.network=WARN 44 | log4j.logger.akka=WARN 45 | log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=WARN 46 | log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=WARN 47 | 48 | # Hello Fresh com.vitthalmirji logging into separate file 49 | log4j.logger.com.vitthalmirji=INFO, vimAppender 50 | log4j.additivity.com.vitthalmirji=false 51 | log4j.appender.vimAppender=org.apache.log4j.FileAppender 52 | log4j.appender.vimAppender.File=${spark.yarn.app.container.log.dir}/stdout 53 | log4j.appender.vimAppender.layout=org.apache.log4j.PatternLayout 54 | log4j.appender.vimAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 55 | 56 | # Spark Bigquery logging into separate file 57 | log4j.logger.com.google.cloud.spark.bigquery=INFO, sparkbigqueryAppender 58 | log4j.additivity.com.google.cloud.spark.bigquery=false 59 | log4j.appender.sparkbigqueryAppender=org.apache.log4j.FileAppender 60 | log4j.appender.sparkbigqueryAppender.File=${spark.yarn.app.container.log.dir}/spark-big-query.log 61 | log4j.appender.sparkbigqueryAppender.layout=org.apache.log4j.PatternLayout 62 | log4j.appender.sparkbigqueryAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 63 | 64 | # Bigquery logging into separate file 65 | log4j.logger.com.google.cloud.bigquery=INFO, bigqueryAppender 66 | log4j.additivity.com.google.cloud.bigquery=false 67 | log4j.appender.bigqueryAppender=org.apache.log4j.FileAppender 68 | log4j.appender.bigqueryAppender.File=${spark.yarn.app.container.log.dir}/big-query.log 69 | log4j.appender.bigqueryAppender.layout=org.apache.log4j.PatternLayout 70 | log4j.appender.bigqueryAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 71 | 72 | # Hudi logging into separate file 73 | log4j.logger.org.apache.hudi=INFO, hudiAppender 74 | log4j.additivity.org.apache.hudi=false 75 | log4j.appender.hudiAppender=org.apache.log4j.FileAppender 76 | log4j.appender.hudiAppender.File=${spark.yarn.app.container.log.dir}/hudi.log 77 | log4j.appender.hudiAppender.layout=org.apache.log4j.PatternLayout 78 | log4j.appender.hudiAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 79 | 80 | # Cosmos logging into separate file 81 | log4j.logger.com.microsoft.azure.cosmosdb=INFO, cosmosdbAppender 82 | log4j.additivity.com.microsoft.azure.cosmosdb=false 83 | log4j.appender.cosmosdbAppender=org.apache.log4j.FileAppender 84 | 
log4j.appender.cosmosdbAppender.File=${spark.yarn.app.container.log.dir}/cosmosdb.log 85 | log4j.appender.cosmosdbAppender.layout=org.apache.log4j.PatternLayout 86 | log4j.appender.cosmosdbAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 87 | 88 | # GCS logging into separate file 89 | log4j.logger.com.google.cloud.storage=INFO, gcsAppender 90 | log4j.additivity.com.google.cloud.storage=false 91 | log4j.appender.gcsAppender=org.apache.log4j.FileAppender 92 | log4j.appender.gcsAppender.File=${spark.yarn.app.container.log.dir}/gcs.log 93 | log4j.appender.gcsAppender.layout=org.apache.log4j.PatternLayout 94 | log4j.appender.gcsAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 95 | -------------------------------------------------------------------------------- /conf/spark/sparkConf.conf: -------------------------------------------------------------------------------- 1 | GLOBAL { 2 | "master" = "yarn" 3 | "hive.exec.dynamic.partition.mode" = "nonstrict" 4 | "hive.exec.dynamic.partition" = "true" 5 | "spark.sql.sources.partitionOverwriteMode" = "dynamic" 6 | "mapreduce.fileoutputcommitter.algorithm.version" = "2" 7 | "parquet.enable.summary-metadata" = "false" 8 | "parquet.compression" = "snappy" 9 | "spark.sql.parquet.mergeSchema" = "false" 10 | "spark.sql.parquet.filterPushdown" = "true" 11 | "spark.sql.hive.metastorePartitionPruning" = "true" 12 | "spark.sql.orc.filterPushdown" = "true" 13 | "spark.sql.orc.splits.include.file.footer" = "true" 14 | "spark.sql.orc.cache.stripe.details.size" = "10000" 15 | "spark.sql.broadcastTimeout" = "1800" 16 | } 17 | 18 | LOCAL { 19 | "master" = "local[*]" 20 | "spark.hadoop.hive.exec.dynamic.partition.mode" = "nonstrict" 21 | "spark.hadoop.hive.exec.dynamic.partition" = "true" 22 | "spark.sql.sources.partitionOverwriteMode" = "dynamic" 23 | "spark.executor.instances" = "1" 24 | } 25 | -------------------------------------------------------------------------------- /docs/APIDOC.MD: -------------------------------------------------------------------------------- 1 | # Datalake ETL Pipeline 2 | Data transformation simplified for any Data platform. 3 | 4 | `Features:` The package has complete ETL process - 5 | 1. Uses metadata, transformation & data model information to design ETL pipeline 6 | 2. Builds target transformation SparkSQL and Spark Dataframes 7 | 3. Builds source & target Hive DDLs 8 | 4. Validates DataFrames, extends core classes, defines DataFrame transformations, and provides UDF SQL functions. 9 | 5. Supports below fundamental transformations for ETL pipeline - 10 | * Filters on source & target dataframes 11 | * Grouping and Aggregations on source & target dataframes 12 | * Heavily nested queries / dataframes 13 | 6. Has complex and heavily nested XML, JSON, Parquet & ORC parser to nth 14 | level of nesting 15 | 7. Has Unit test cases designed on function/method level & measures 16 | source code coverage 17 | 8. Has information about delpoying to higher environments 18 | 9. Has API documentation for customization & enhancement 19 | 20 | `Enhancements:` In progress - 21 | 1. Integrate Audit and logging - Define Error codes, log process 22 | failures, Audit progress & runtime information 23 | 24 | # Datalake ETL Pipeline API documentation 25 | ## Mappers for complex/nested data sources 26 | * Has interface `IMapper` and implemented concrete class `XmlMapper`. We 27 | can use same abstract / interface for other category of file mapping 28 | viz. XML/JSON/Parquet/ORC. 
29 | * Core methods/function common for overriding 30 | are – `getDataframeSchema`, `createDDL`, `complexTypeIterator`, 31 | `handleStructType`, `handleArrayType` 32 | 33 | * Overview of complex type parsing & exploding - 34 | * ![Complex type parser](images/XMLParse.png) 35 | ``` 36 | def handleStructType(self, viewname, viewpath, database, table, xpath, level, dtype, acc={}, xpaths=[]) 37 | ``` 38 | ``` 39 | def handleArrayType(self, viewname, viewpath, database, table, xpath, level, dtype: ArrayType, acc={}, xpaths=[]) 40 | ``` 41 | ``` 42 | def complexTypeIterator(self, viewname, viewpath, database, table, xpath, level, dtype: DataType, acc={}, xpaths=[]) 43 | ``` 44 | 45 | ### XmlMapper 46 | * `XmlMapper` specific methods / functions – `createViewsAndXpaths`, 47 | `buildXmlSerdeDDL` 48 | 49 | ``` 50 | def createViewsAndXpaths(self, df: DataFrame, database, table) 51 | ``` 52 | ``` 53 | def buildXmlSerdeDdl(self, database, table, xmlsourcelocation, xmlrowstarttag, xmlrowendtag) 54 | ``` 55 | 56 | ## Pyspark Core Class Extensions 57 | 58 | ``` 59 | from etl.meta import * 60 | ``` 61 | 62 | ### Column Extensions 63 | 64 | **isFalsy()** 65 | 66 | ```python 67 | source_df.withColumn("is_stuff_falsy", F.col("has_stuff").isFalsy()) 68 | ``` 69 | 70 | Returns `True` if `has_stuff` is `None` or `False`. 71 | 72 | **isTruthy()** 73 | 74 | ```python 75 | source_df.withColumn("is_stuff_truthy", F.col("has_stuff").isTruthy()) 76 | ``` 77 | 78 | Returns `True` unless `has_stuff` is `None` or `False`. 79 | 80 | **isNullOrBlank()** 81 | 82 | ```python 83 | source_df.withColumn("is_blah_null_or_blank", F.col("blah").isNullOrBlank()) 84 | ``` 85 | 86 | Returns `True` if `blah` is `null` or blank (the empty string or a string that only contains whitespace). 87 | 88 | **isNotIn()** 89 | 90 | ```python 91 | source_df.withColumn("is_not_bobs_hobby", F.col("fun_thing").isNotIn(bobs_hobbies)) 92 | ``` 93 | 94 | Returns `True` if `fun_thing` is not included in the `bobs_hobbies` list. 95 | 96 | **nullBetween()** 97 | 98 | ```python 99 | source_df.withColumn("is_between", F.col("age").nullBetween(F.col("lower_age"), F.col("upper_age"))) 100 | ``` 101 | 102 | Returns `True` if `age` is between `lower_age` and `upper_age`. If `lower_age` is populated and `upper_age` is `null`, it will return `True` if `age` is greater than or equal to `lower_age`. If `lower_age` is `null` and `upper_age` is populate, it will return `True` if `age` is lower than or equal to `upper_age`. 103 | 104 | ### SparkSession Extensions 105 | 106 | **create_df()** 107 | 108 | ```python 109 | spark.create_df( 110 | [("jose", "a"), ("li", "b"), ("sam", "c")], 111 | [("name", StringType(), True), ("blah", StringType(), True)] 112 | ) 113 | ``` 114 | 115 | Creates DataFrame with a syntax that's less verbose than the built-in `createDataFrame` method. 116 | 117 | ### DataFrame Extensions 118 | 119 | **applyTransform()** 120 | 121 | ```python 122 | source_df\ 123 | .applyTransform(lambda df: with_greeting(df))\ 124 | .applyTransform(lambda df: with_something(df, "crazy")) 125 | ``` 126 | 127 | Allows for multiple DataFrame transformations to be run and executed. 128 | 129 | ## Helper Functions 130 | 131 | ```python 132 | 133 | import etl 134 | ``` 135 | 136 | ### DataFrame Validations 137 | 138 | **validatePresenceOfColumns()** 139 | 140 | ```python 141 | etl.meta.validatePresenceOfColumns(source_df, ["name", "age", "fun"]) 142 | ``` 143 | 144 | Raises an exception unless `source_df` contains the `name`, `age`, and `fun` column. 
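
A validation helper of this shape boils down to a few lines; the sketch below only illustrates the idea and is not necessarily the packaged `etl.meta` implementation:

```python
def validatePresenceOfColumns(df, required_col_names):
    # Compare required names against the DataFrame's actual columns and
    # fail fast with a descriptive message when any are missing.
    missing = [c for c in required_col_names if c not in df.columns]
    if missing:
        raise ValueError(
            f"DataFrame is missing required columns {missing}; "
            f"available columns are {df.columns}"
        )
```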
145 | 146 | **validateSchema()** 147 | 148 | ```python 149 | etl.meta.validateSchema(source_df, required_schema) 150 | ``` 151 | 152 | Raises an exception unless `source_df` contains all the `StructFields` defined in the `required_schema`. 153 | 154 | **validateAbsenseOfColumns()** 155 | 156 | ```python 157 | etl.meta.validateAbsenseOfColumns(source_df, ["age", "cool"]) 158 | ``` 159 | 160 | Raises an exception if `source_df` contains `age` or `cool` columns. 161 | 162 | ### Functions 163 | 164 | **single_space()** 165 | 166 | ```python 167 | actual_df = source_df.withColumn( 168 | "words_single_spaced", 169 | etl.meta.single_space(col("words")) 170 | ) 171 | ``` 172 | 173 | 174 | Replaces all multispaces with single spaces (e.g. changes `"this has some"` to `"this has some"`. 175 | 176 | **remove_all_whitespace()** 177 | 178 | ```python 179 | actual_df = source_df.withColumn( 180 | "words_without_whitespace", 181 | etl.meta.remove_all_whitespace(col("words")) 182 | ) 183 | ``` 184 | 185 | Removes all whitespace in a string (e.g. changes `"this has some"` to `"thishassome"`. 186 | 187 | **anti_trim()** 188 | 189 | ```python 190 | actual_df = source_df.withColumn( 191 | "words_anti_trimmed", 192 | etl.meta.anti_trim(col("words")) 193 | ) 194 | ``` 195 | 196 | Removes all inner whitespace, but doesn't delete leading or trailing whitespace (e.g. changes `" this has some "` to `" thishassome "`. 197 | 198 | **remove_non_word_characters()** 199 | 200 | ```python 201 | actual_df = source_df.withColumn( 202 | "words_without_nonword_chars", 203 | etl.meta.remove_non_word_characters(col("words")) 204 | ) 205 | ``` 206 | 207 | Removes all non-word characters from a string (e.g. changes `"si%$#@!#$!@#mpsons"` to `"simpsons"`. 208 | 209 | **exists()** 210 | 211 | ```python 212 | source_df.withColumn( 213 | "any_num_greater_than_5", 214 | etl.meta.exists(lambda n: n > 5)(col("nums")) 215 | ) 216 | ``` 217 | 218 | `nums` contains lists of numbers and `exists()` returns `True` if any of the numbers in the list are greater than 5. It's similar to the Python `any` function. 219 | 220 | **forall()** 221 | 222 | ```python 223 | source_df.withColumn( 224 | "all_nums_greater_than_3", 225 | etl.meta.forall(lambda n: n > 3)(col("nums")) 226 | ) 227 | ``` 228 | 229 | `nums` contains lists of numbers and `forall()` returns `True` if all of the numbers in the list are greater than 3. It's similar to the Python `all` function. 230 | 231 | **multi_equals()** 232 | 233 | ```python 234 | source_df.withColumn( 235 | "are_s1_and_s2_cat", 236 | etl.meta.multi_equals("cat")(col("s1"), col("s2")) 237 | ) 238 | ``` 239 | 240 | `multi_equals` returns true if `s1` and `s2` are both equal to `"cat"`. 241 | 242 | ### Transformations 243 | 244 | **snakeCaseColumnNames()** 245 | 246 | ```python 247 | etl.meta.snakeCaseColumnNames(source_df) 248 | ``` 249 | 250 | Converts all the column names in a DataFrame to snake_case. It's annoying to write SQL queries when columns aren't snake cased. 251 | 252 | **sort_columns()** 253 | 254 | ```python 255 | etl.meta.sort_columns(source_df, "asc") 256 | ``` 257 | 258 | Sorts the DataFrame columns in alphabetical order. Wide DataFrames are easier to navigate when they're sorted alphabetically. 259 | 260 | ### DataFrame Helpers 261 | 262 | **columnToList()** 263 | 264 | ```python 265 | etl.meta.columnToList(source_df, "name") 266 | ``` 267 | 268 | Converts a column in a DataFrame to a list of values. 
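
For reference, a helper like `columnToList` is typically just a `select` followed by `collect`; the version below is a sketch and may differ from the shipped implementation:

```python
def columnToList(df, col_name):
    # Collect a single column to the driver and unwrap each Row into its value.
    return [row[col_name] for row in df.select(col_name).collect()]
```

Because `collect()` pulls data to the driver, this is intended for small, lookup-style columns rather than large datasets.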
269 | 270 | **twoColumns2Dictionary()** 271 | 272 | ```python 273 | etl.meta.twoColumns2Dictionary(source_df, "name", "age") 274 | ``` 275 | 276 | Converts two columns of a DataFrame into a dictionary. In this example, `name` is the key and `age` is the value. 277 | 278 | **toListOfDictionaries()** 279 | 280 | ```python 281 | etl.meta.toListOfDictionaries(source_df) 282 | ``` 283 | Converts an entire DataFrame into a list of dictionaries. 284 | -------------------------------------------------------------------------------- /docs/PysparkLocalSetup.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/PysparkLocalSetup.docx -------------------------------------------------------------------------------- /docs/SETUP.MD: -------------------------------------------------------------------------------- 1 | # Datalake ETL Pipeline 2 | Data transformation simplified for any Data platform. 3 | 4 | `Features:` The package has complete ETL process - 5 | 1. Uses metadata, transformation & data model information to design ETL pipeline 6 | 2. Builds target transformation using both SparkSQL and Spark Dataframes 7 | \- Developer to choose option 8 | 3. Builds source & target Hive DDLs 9 | 4. Validates DataFrames, extends core classes, defines DataFrame transformations, and provides UDF SQL functions. 10 | 5. Supports below fundamental transformations for ETL pipeline - 11 | * Filters on source & target dataframes 12 | * Grouping and Aggregations on source & target dataframes 13 | * Heavily nested queries / dataframes 14 | 6. Has complex and heavily nested XML, JSON, Parquet & ORC parser to nth 15 | level of nesting 16 | 7. Has Unit test cases designed on function/method level & measures 17 | source code coverage 18 | 8. Has information about delpoying to higher environments 19 | 9. Has API documentation for customization & enhancement 20 | 21 | `Enhancements:` In progress - 22 | 1. Integrate Audit and logging - Define Error codes, log process failures, Audit progress & runtime information 23 | 24 | ## Setup for Python and Pyspark on Windows & Linux machines locally 25 | 26 | First please install 64 bit JDK 1.8 or 8 for your operating system from Oracle https://www.oracle.com/java/technologies/javase-jdk8-downloads.html 27 | set below below environment variable pointing to home directory of java (Please remember the path mentioned below may be different for you) 28 | ``` 29 | JAVA_HOME=C:\Program Files\Java\jdk1.8.0_231 30 | ``` 31 | 32 | Next, Install PyCharm community edition from https://www.jetbrains.com/pycharm/download/ 33 | 34 | 35 | Make sure you have Python >= 3.0 and Do not use Python 3.8; Recommended is Python 3.7 and PySpark 2.3.4 36 | If you do not have pip tool, get it from https://pypi.org/project/pip/ and execute below command 37 | **If you already have Python & pip installed skip the below step** 38 | ``` 39 | python get-pip.py 40 | ``` 41 | 42 | Once you have Python and pip follow steps below - 43 | 1. 
Install below libraries for virtual environment (Linux) 44 | ``` 45 | pip install virtualenvwrapper 46 | ``` 47 | Windows - 48 | ``` 49 | pip install virtualenvwrapper-win 50 | ``` 51 | Set up the working directory for virtual environments (Linux) 52 | ``` 53 | export WORKON_HOME=~/Envs 54 | mkdir -p $WORKON_HOME 55 | source /usr/local/bin/virtualenvwrapper.sh 56 | ``` 57 | Windows 58 | ``` 59 | Create directory C:\Users\\Envs 60 | WORKON_HOME=C:\Users\\Envs 61 | ``` 62 | 63 | 2. Setup below **ENVIRONMENT VARIABLES** 64 | 65 | **Unix / Linux / Mac** 66 | 67 | `Please note: Your computer path may vary, use your computer path in below given in example ` 68 | - **PYTHONPATH** - Full path to python executable 69 | ``` 70 | export PYTHONPATH=/usr/python37 71 | ``` 72 | - **PATH** - Update PATH variable add PYTHONPATH 73 | ``` 74 | export PATH=$PATH%:$PYTHONPATH 75 | ``` 76 | - **VIRTUALENV_PYTHON** - To create virtual environments. Path is same as PYTHONPATH 77 | ``` 78 | export VIRTUALENV_PYTHON=$PYTHONPATH 79 | ``` 80 | - **VIRTUALENVWRAPPER_VIRTUALENV** - Wrapper for Virtual Environment tools. Path is Scripts folder under PYTHONPATH 81 | ``` 82 | VIRTUALENVWRAPPER_VIRTUALENV=$PYTHONPATH/Scripts 83 | ``` 84 | **Windows** 85 | 86 | `Please note: Your computer path may vary, use your computer path in below given in example ` 87 | - **PYTHONPATH** - Full path to python.exe 88 | ``` 89 | PYTHONPATH=C:\Program Files\Python37 90 | ``` 91 | - **PATH** - Update PATH variable add PYTHONPATH 92 | ``` 93 | PATH=%PATH%;%PYTHONPATH% 94 | ``` 95 | - **VIRTUALENV_PYTHON** - To create virtual environments. Path is same as PYTHONPATH 96 | ``` 97 | VIRTUALENV_PYTHON=%PYTHONPATH% 98 | ``` 99 | - **VIRTUALENVWRAPPER_VIRTUALENV** - Wrapper for Virtual Environment tools. Path is Scripts folder under PYTHONPATH 100 | ``` 101 | VIRTUALENVWRAPPER_VIRTUALENV=%PYTHONPATH%\Scripts 102 | 103 | ``` 104 | 4. Install Hadoop Binaries for Windows: winutils.exe; Note this is only for Windows. Linux users Download Spark libraries for Linux - 105 | - Download binaries from URL https://github.com/steveloughran/winutils 106 | - Unzip and place in some directory: For Example I'm using C:\winutils-master 107 | - Within extracted folder we have hadoop-2.7.1 --> Example: C:\winutils-master\hadoop-2.7.1 108 | - Declare an Environment variable HADOOP_HOME = C:\winutils-master\hadoop-2.7.1 and update PATH Variable 109 | ``` 110 | HADOOP_HOME=C:\winutils-master\hadoop-2.7.1 111 | PATH=%PATH%;%HADOOP_HOME%\bin 112 | ``` 113 | 5. Check if all the Environment Variables are working - 114 | - Open Command prompt 115 | - Type `python` it must open Python 3.7.x 116 | - Type `winutils`, `hadoop`, `hdfs` it must give help instructions of HDFS 117 | 118 | 3. 
Create a spark project - 119 | - Create a directory for example `datalake-etl-pipeline` anywhere in your computer; Here I'm using `/home/` and create a new virtual environment using commands below - 120 | ``` 121 | mkvirtualenv -a -p py37 122 | workon py37 123 | cdproject 124 | ``` 125 | `cdproject` command will switch to `datalake-etl-pipeline` folder 126 | 127 | Import required libraries from requirements.txt from `datalake-etl-pipeline` root folder, use command below – 128 | ``` 129 | pip install -r requirements.txt 130 | ``` 131 | To Freeze the existing installed packages use command below - 132 | ``` 133 | pip freeze > requirements.txt 134 | ``` 135 | # ToDo - Yet to add instructions for deploying into higher environments 136 | ## Delpoy to higher environment -------------------------------------------------------------------------------- /docs/images/DataQualityUML.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/DataQualityUML.png -------------------------------------------------------------------------------- /docs/images/XMLParse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/XMLParse.png -------------------------------------------------------------------------------- /docs/images/dq-task1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/dq-task1.png -------------------------------------------------------------------------------- /docs/images/dq-task2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/dq-task2.png -------------------------------------------------------------------------------- /docs/images/task1_ouput_er.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/task1_ouput_er.png -------------------------------------------------------------------------------- /docs/images/task2_ouput_er.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/task2_ouput_er.png -------------------------------------------------------------------------------- /docs/setup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 17 | 18 | 19 |
Vitthal Data Transformation

Data transformation simplified for any Data platform.

Features: The package has complete ETL process -

  1. Uses metadata, transformation & data model information to design ETL pipeline
  2. Builds target transformation SparkSQL and Spark Dataframes
  3. Builds source & target Hive DDLs
  4. Validates DataFrames, extends core classes, defines DataFrame transformations, and provides UDF SQL functions.
  5. Supports below fundamental transformations for ETL pipeline -
     • Filters on source & target dataframes
     • Grouping and Aggregations on source & target dataframes
     • Heavily nested queries / dataframes
  6. Has complex and heavily nested XML, JSON, Parquet & ORC parser to nth level of nesting
  7. Has Unit test cases designed on function/method level & measures source code coverage
  8. Has information about deploying to higher environments
  9. Has API documentation for customization & enhancement

Enhancements: In progress -

  1. Integrate Audit and logging - Define Error codes, log process failures, Audit progress & runtime information

Setup

Make sure you have Python >= 3.0. If you do not have the pip tool, get it from https://pypi.org/project/pip/ and execute the command below. If you already have Python & pip installed, skip this step.

python get-pip.py

Once you have Python and pip follow steps below -

  1. Install below libraries for virtual environment (Linux)

     pip install virtualenvwrapper

     Windows -

     pip install virtualenvwrapper-win

     Set up the working directory for virtual environments (Linux)

     export WORKON_HOME=~/Envs
     mkdir -p $WORKON_HOME
     source /usr/local/bin/virtualenvwrapper.sh

     Windows

     set WORKON_HOME=C:\Users\<username>\.Envs
     mkdir -p %WORKON_HOME%

  2. Setup below ENVIRONMENT VARIABLES

     Unix / Linux / Mac

     Please note: Your computer path may vary, use your computer path in place of the examples below

     • PYTHONPATH - Full path to python executable
       export PYTHONPATH=/usr/python37
     • PATH - Update PATH variable add PYTHONPATH
       export PATH=$PATH:$PYTHONPATH
     • VIRTUALENV_PYTHON - To create virtual environments. Path is same as PYTHONPATH
       export VIRTUALENV_PYTHON=$PYTHONPATH
     • VIRTUALENVWRAPPER_VIRTUALENV - Wrapper for Virtual Environment tools. Path is Scripts folder under PYTHONPATH
       VIRTUALENVWRAPPER_VIRTUALENV=$PYTHONPATH/Scripts

     Windows

     Please note: Your computer path may vary, use your computer path in place of the examples below

     • PYTHONPATH - Full path to python.exe
       PYTHONPATH=C:\Program Files\Python37
     • PATH - Update PATH variable add PYTHONPATH
       PATH=%PATH%;%PYTHONPATH%
     • VIRTUALENV_PYTHON - To create virtual environments. Path is same as PYTHONPATH
       VIRTUALENV_PYTHON=%PYTHONPATH%
     • VIRTUALENVWRAPPER_VIRTUALENV - Wrapper for Virtual Environment tools. Path is Scripts folder under PYTHONPATH
       VIRTUALENVWRAPPER_VIRTUALENV=%PYTHONPATH%\Scripts

  3. Copy the Vitthal-datalake folder to your home folder /home/<username> and create a new virtual environment using commands below -

     mkvirtualenv -a <path-to-Vitthal-datalake> -p <full-path-to-python.exe> py37
     workon py37
     cdproject

     cdproject command will switch to Vitthal-datalake folder

     Import required libraries from requirements.txt from Vitthal-datalake root folder, use command below –

     pip install -r requirements.txt

Deploy to higher environment
151 | 152 | -------------------------------------------------------------------------------- /logs/bash/logs: -------------------------------------------------------------------------------- 1 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - 55729 - Executing create_python_venv.sh on m-c02f6224md6n with arguments 2 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - WARN 55729 - Checking if python3 is installed on machine 3 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 4 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - 55729 - python3 already installed on machine 5 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - WARN 55729 - Checking if pip tool is installed on machine or attempt to download from internet & install 6 | 2023-04-18T18:06:36+05:30: EXECUTION-LOG - pip 21.1.3 from /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pip (python 3.9) 7 | 2023-04-18T18:06:36+05:30: EXECUTION-LOG - 55729 - pip tool already available on machine 8 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Defaulting to user installation because normal site-packages is not writeable 9 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: virtualenvwrapper in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (4.8.4) 10 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: virtualenv in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenvwrapper) (20.4.7) 11 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: virtualenv-clone in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenvwrapper) (0.5.4) 12 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: stevedore in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenvwrapper) (3.3.0) 13 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: pbr!=2.1.0,>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from stevedore->virtualenvwrapper) (5.6.0) 14 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: six<2,>=1.9.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenv->virtualenvwrapper) (1.16.0) 15 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: appdirs<2,>=1.4.3 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenv->virtualenvwrapper) (1.4.4) 16 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: distlib<1,>=0.3.1 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenv->virtualenvwrapper) (0.3.2) 17 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: filelock<4,>=3.0.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenv->virtualenvwrapper) (3.0.12) 18 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - DEBUG 55729 - Using below environment variables & path for virtual environment test 19 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - DEBUG 55729 - VIRTUALENVWRAPPER_PYTHON=/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 20 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - DEBUG 55729 - WORKON_HOME=~/python_venvs/ 21 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - created virtual environment CPython3.9.5.final.0-64 in 1454ms 22 | 
2023-04-18T18:06:40+05:30: EXECUTION-LOG - creator CPython3Posix(dest=/Users/v0m02sj/python_venvs/test, clear=False, no_vcs_ignore=False, global=False) 23 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/Users/v0m02sj/Library/Application Support/virtualenv) 24 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - added seed packages: pip==21.1.2, setuptools==57.0.0, wheel==0.36.2 25 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - activators BashActivator,CShellActivator,FishActivator,PowerShellActivator,PythonActivator,XonshActivator 26 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - Setting project for test to /Users/v0m02sj/PycharmProjects/vim89-data-engineering-test 27 | 2023-04-18T18:06:46+05:30: EXECUTION-LOG - Removing test... 28 | 2023-04-18T18:06:47+05:30: EXECUTION-LOG - 55729 - Process finished successfully, logs can be found at /Users/v0m02sj/PycharmProjects/vim89-data-engineering-test/scripts/bin/../logs/log-python-venv-setup-test-2023-04-18-18.06.35.log 29 | -------------------------------------------------------------------------------- /logs/python/log-sample: -------------------------------------------------------------------------------- 1 | 2023-04-19 01:32:44,837 root INFO 63460 init_logging Logging initiated; appending logs to /Users/v0m02sj/PycharmProjects/vim89-data-engineering-test/logs/python/log-Hello-World_2023-04-19T01-32-44-810431.log 2 | 2023-04-19 01:32:44,837 root WARNING 63460 main HelloFresh Recipes Data Engineering 3 | 2023-04-19 01:32:44,838 root WARNING 63460 get_or_create_spark_session Creating spark session first time with configs [{'key': 'spark.app.name', 'value': ''}] 4 | 2023-04-19 01:32:44,838 root INFO 63460 read_data_as_spark_dataframe Attempting to read json in spark using configs {'encoding': 'UTF-8'} from location /Users/v0m02sj/PycharmProjects/vim89-data-engineering-test/resources/input 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | psycopg2-binary 2 | psycopg2 3 | botocore 4 | boto3 5 | boto 6 | awscli 7 | dos2unix 8 | lxml 9 | mock 10 | moto 11 | urllib3 12 | tqdm 13 | xmlschema 14 | xlrd 15 | awsglue-local 16 | flask 17 | flask_cors 18 | coverage 19 | isodate 20 | py4j 21 | pyspark 22 | pyspark-stubs 23 | pytz 24 | tabulate -------------------------------------------------------------------------------- /resources/data-quality-reports/recipe-tasks/task1-dq-report.html: -------------------------------------------------------------------------------- 1 | 2 |

Team,

Data Quality check finished for DQ ID = 101, but with failures. Check details in 3 | the table of metrics below.

4 |

Failed DQ details

5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 |
Yarn Application Id | DQ ID | Rule ID | Rule Name | Rule type | Description | Columns/Query | Pass Count | Fail Count | Total Count
local-1682280549403 | 101 | 1011 | Primary / Natural Keys | unique | Primary / Natural Keys should not have duplicates | ['name'] | 1039 | 3 | 1042
local-1682280549403 | 101 | 1012 | NOT NULL fields | not null | Field should have valid value | ['name', 'cookTime', 'prepTime'] | 715 | 327 | 1042
local-1682280549403 | 101 | 1014 | Check for invalid cook & prep time | query | Check empty or null values | None | 716 | 326 | 1042
55 |

Succeeded DQ details

56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 |
Yarn Application Id | DQ ID | Rule ID | Rule Name | Rule type | Description | Columns/Query | Pass Count | Fail Count | Total Count
local-1682280549403 | 101 | 1013 | Input files check | query | Check If all input files are read for processing | None | 1042 | 0 | 1042
82 |

Executed on 2023-04-24 01:39:19,
Thanks 83 | 84 |
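
Illustrative note: the pass/fail/total counts in the report above come from the rule types configured under conf/data-quality/rules ('unique', 'not null', 'query'). The sketch below shows one way a 'unique' and a 'not null' rule could yield counts like 1039/3/1042; it is not the repository's data_quality module, and the function names and blank-string handling are assumptions made for the example.

from pyspark.sql import DataFrame
from pyspark.sql import functions as F

def unique_rule_counts(df: DataFrame, key_columns: list) -> dict:
    # Pass = number of distinct key combinations; fail = surplus duplicate rows.
    total = df.count()
    distinct_keys = df.select(*key_columns).distinct().count()
    return {"pass": distinct_keys, "fail": total - distinct_keys, "total": total}

def not_null_rule_counts(df: DataFrame, columns: list) -> dict:
    # Pass = rows where every listed column is non-null and non-blank (blank handling is an assumption).
    condition = F.lit(True)
    for c in columns:
        condition = condition & F.col(c).isNotNull() & (F.trim(F.col(c)) != "")
    total = df.count()
    passed = df.filter(condition).count()
    return {"pass": passed, "fail": total - passed, "total": total}

# Hypothetical usage against the recipes data:
# unique_rule_counts(recipes_df, ["name"])
# not_null_rule_counts(recipes_df, ["name", "cookTime", "prepTime"])
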
-------------------------------------------------------------------------------- /resources/data-quality-reports/recipe-tasks/task2-dq-report.html: -------------------------------------------------------------------------------- 1 | 2 |

Team,

Data Quality check finished successfully for DQ ID = 101. Check details in the table of 3 | metrics below.

4 |

Succeeded DQ details

5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 |
Yarn Application Id | DQ ID | Rule ID | Rule Name | Rule type | Description | Columns/Query | Pass Count | Fail Count | Total Count
local-1682280549403 | 101 | 1015 | Primary / Natural Keys | unique | Primary / Natural Keys should not have duplicates | ['difficulty'] | 3 | 0 | 3
local-1682280549403 | 101 | 1016 | NOT NULL fields | not null | Field should have valid value | ['difficulty', 'avg_total_cooking_time'] | 3 | 0 | 3
43 |

Executed on 2023-04-24 01:39:24,
Thanks 44 | 45 |
-------------------------------------------------------------------------------- /resources/data/clinical_trial/data/chunk1.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/clinical_trial/data/chunk1.zip -------------------------------------------------------------------------------- /resources/data/clinical_trial/job_parameters/clinical_trial.json: -------------------------------------------------------------------------------- 1 | { 2 | "clinical_trial_etl": { 3 | "bucket": "dev", 4 | "landing_directory": "data/raw/clinical_trial_landing", 5 | "staging_directory": "data/raw/clinical_trial_staging", 6 | "download_url_prefix_test": "https://github.com/vim89/datalake-etl-pipeline/raw/master/src/resources/clinical_trial/data/chunk", 7 | "download_url_prefix": "https://clinicaltrials.gov/ct2/download_studies?down_chunk=", 8 | "max_chunk_range": 2, 9 | "download_target_filename": "clinical_studies.zip", 10 | "xml_closing_tag": "clinical_study", 11 | "xml_root_tag": "clinical_study", 12 | "xml_row_tag": "clinical_study", 13 | "xml_attribute_tag": "xml_attribute_value", 14 | "xml_attribute_prefix": "xmlattribute_", 15 | "xml_value_tag": "xml_value_tag", 16 | "audit_columns_definition": [ 17 | "reverse(split(input_file_name(), '/'))[0] AS xml_file_name", 18 | "CAST('{ts}' AS TIMESTAMP) AS spark_timestamp" 19 | ], 20 | "audit_columns": [ 21 | "xml_file_name", 22 | "spark_timestamp" 23 | ], 24 | "timestamp_column": [ 25 | "spark_timestamp" 26 | ], 27 | "primary_keys": [ 28 | "id_info.nct_id", 29 | "xml_file_name" 30 | ], 31 | "primary_keys_cascade_to_leaf_level_with_alias": [ 32 | "id_info.nct_id AS pk_nct_id", 33 | "spark_timestamp AS spark_ts" 34 | ], 35 | "order_by_keys": [ 36 | "spark_timestamp" 37 | ], 38 | "hashcode_column": [ 39 | "hashcode" 40 | ], 41 | "target_primary_keys": [ 42 | "nct_id" 43 | ], 44 | "hashcode_encryption_type": "md5", 45 | "cdc_staging_data_write_mode": "append", 46 | "audit_directory": "audit/", 47 | "job_name": "clinical_trial_etl" 48 | } 49 | } -------------------------------------------------------------------------------- /resources/data/clinical_trial/sql/transformations/sponsors.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | nct_id, 3 | agency_class, 4 | lead_or_collaborator, 5 | name, 6 | CAST(last_update_timestamp AS TIMESTAMP) AS last_update_timestamp 7 | FROM 8 | ( 9 | SELECT xmltable_sponsors_lead_sponsor.pk_nct_id AS nct_id, agency_class agency_class, 'lead' AS lead_or_collaborator, 10 | agency AS name, xmltable_sponsors_lead_sponsor.spark_ts AS last_update_timestamp 11 | FROM xmltable_sponsors_lead_sponsor 12 | LEFT JOIN xmltable_sponsors ON 13 | xmltable_sponsors.surrogate_id_xmltable_sponsors = xmltable_sponsors_lead_sponsor.surrogate_id_xmltable_sponsors 14 | AND xmltable_sponsors.pk_nct_id = xmltable_sponsors_lead_sponsor.pk_nct_id 15 | 16 | UNION ALL 17 | 18 | SELECT xmltable_sponsors_collaborator.pk_nct_id AS nct_id, agency_class AS agency_class, 19 | 'collaborator' AS lead_or_collaborator, agency AS name, 20 | xmltable_sponsors_collaborator.spark_ts AS last_update_timestamp 21 | FROM xmltable_sponsors_collaborator 22 | LEFT JOIN xmltable_sponsors ON xmltable_sponsors.surrogate_id_xmltable_sponsors = xmltable_sponsors_collaborator.surrogate_id_xmltable_sponsors 23 | AND xmltable_sponsors.pk_nct_id = 
xmltable_sponsors_collaborator.pk_nct_id 24 | ) sponsors 25 | -------------------------------------------------------------------------------- /resources/data/config/application_properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "default_settings": { 4 | "max_parallel_spark_submit_process": 6, 5 | "history_load_interval_in_days": 30 6 | }, 7 | "command_line_args": [ 8 | { 9 | "name": "workflow", 10 | "type": "string", 11 | "default": "None" 12 | }, 13 | { 14 | "name": "startDate", 15 | "type": "string", 16 | "default": "None" 17 | }, 18 | { 19 | "name": "endDate", 20 | "type": "string", 21 | "default": "None" 22 | }, 23 | { 24 | "name": "refreshType", 25 | "type": "string", 26 | "default": "None" 27 | }, 28 | { 29 | "name": "dq_enabled", 30 | "type": "string", 31 | "default": "N" 32 | }, 33 | { 34 | "name": "configFile", 35 | "type": "string", 36 | "default": "/u/users/svcdvchnlperf/adhoc/config-prod.yml" 37 | } 38 | ], 39 | "spark_submit_options_order": { 40 | "spark-submit": { 41 | "priority": 0, 42 | "required": false, 43 | "value": "" 44 | }, 45 | "--master": { 46 | "priority": 1, 47 | "required": true, 48 | "value": "yarn" 49 | }, 50 | "--deploy-mode": { 51 | "priority": 2, 52 | "required": true, 53 | "value": "cluster" 54 | }, 55 | "--executor-cores": { 56 | "priority": 3, 57 | "required": true, 58 | "value": 5 59 | }, 60 | "--executor-memory": { 61 | "priority": 4, 62 | "required": true, 63 | "value": "4g" 64 | }, 65 | "--num-executors": { 66 | "priority": 5, 67 | "required": true, 68 | "value": 20 69 | }, 70 | "--driver-memory": { 71 | "priority": 6, 72 | "required": true, 73 | "value": "6g" 74 | }, 75 | "--name": { 76 | "priority": 7, 77 | "required": false, 78 | "value": "Channel Performance Spark Job" 79 | }, 80 | "--driver-java-options": { 81 | "priority": 8, 82 | "required": true, 83 | "value": "" 84 | }, 85 | "--conf": { 86 | "priority": 9, 87 | "required": true, 88 | "value": "\"spark.executor.memory=4g\"" 89 | }, 90 | "--jars": { 91 | "priority": 10, 92 | "required": true, 93 | "value": "\"/u/users/svcdvchnlperf/adhoc/ScalaSparkArchetypeCore-1.9.3-bundled.jar\"" 94 | }, 95 | "--files": { 96 | "priority": 11, 97 | "required": true, 98 | "value": "\"/u/users/svcdvchnlperf/adhoc/connections/connection.yaml,/u/users/svcdvchnlperf/adhoc/connections/job.yaml\"" 99 | }, 100 | "--class": { 101 | "priority": 12, 102 | "required": true, 103 | "value": "com.walmartlabs.channel.perf.WorkflowController " 104 | }, 105 | "--class_arguments": { 106 | "priority": 13, 107 | "required": false, 108 | "value": { 109 | "workflow": "", 110 | "dq_enabled": "", 111 | "startDate": "", 112 | "endDate": "", 113 | "refreshType": "", 114 | "configFile": "" 115 | } 116 | } 117 | }, 118 | "spark_submit_options_filter": [ 119 | "primary_keys", 120 | "ADHOC_SCHEMA_GCS_BUCKET", 121 | "STG_SCHEMA_GCS_BUCKET", 122 | "APP_SCHEMA_GCS_BUCKET", 123 | "ADHOC_SCHEMA", 124 | "STG_SCHEMA", 125 | "APP_SCHEMA", 126 | "env", 127 | "enableservices", 128 | "runmode", 129 | "srcrcvts", 130 | "userId" 131 | ] 132 | } -------------------------------------------------------------------------------- /resources/data/config/application_properties.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | default_settings: 3 | max_parallel_spark_submit_process: 5 4 | history_load_interval_in_days: 30 -------------------------------------------------------------------------------- 
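
Illustrative note on the application_properties.json above: each entry under spark_submit_options_order carries a priority, a required flag and a value. The sketch below shows one way such a mapping could be sorted by priority and flattened into a spark-submit command; the real assembly lives in src/com/vitthalmirji/utils/spark_submit_utils.py and may differ, and the '--key value' rendering of class arguments is an assumption for the example.

import json

def build_spark_submit_command(properties_path, class_args):
    # Load the option ordering from the JSON config shown above.
    with open(properties_path) as f:
        options = json.load(f)["spark_submit_options_order"]

    parts = []
    for name, spec in sorted(options.items(), key=lambda kv: kv[1]["priority"]):
        if name == "spark-submit":
            parts.append("spark-submit")
        elif name == "--class_arguments":
            # Rendered as '--key value' tokens purely for illustration.
            parts.extend(f"--{key} {value}" for key, value in class_args.items())
        elif spec["required"] or spec["value"]:
            parts.append(f"{name} {spec['value']}".strip())
    return " ".join(parts)

# Hypothetical usage:
# build_spark_submit_command("resources/data/config/application_properties.json",
#                            {"workflow": "daily_load", "dq_enabled": "Y"})
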
/resources/data/config/logging.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | objects: 3 | queue: 4 | class: queue.Queue 5 | maxsize: 1000 6 | formatters: 7 | simple: 8 | format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' 9 | detailed: 10 | format: '%(asctime)s %(name)-15s %(levelname)-8s %(process)-10d %(funcName)-30s %(message)s' 11 | handlers: 12 | console: 13 | class: logging.StreamHandler 14 | level: DEBUG 15 | formatter: detailed 16 | stream: ext://sys.stdout 17 | console_colored: 18 | class: utils.logging_util.ColoredLogger 19 | name: 'Colored' 20 | file: 21 | class: logging.FileHandler 22 | level: DEBUG 23 | encoding: 'utf-8' 24 | formatter: detailed 25 | filename: ../../logs/log-data-pipeline_{timestamp_placeholder}.log 26 | mode: a 27 | queue: 28 | class: utils.logging_util.QueueListenerHandler 29 | level: DEBUG 30 | handlers: 31 | - cfg://handlers.console 32 | - cfg://handlers.file 33 | queue: cfg://objects.queue 34 | loggers: 35 | simpleExample: 36 | level: INFO 37 | handlers: [console, file, queue] 38 | propagate: no 39 | root: 40 | level: DEBUG 41 | handlers: [console, file] -------------------------------------------------------------------------------- /resources/data/product.csv: -------------------------------------------------------------------------------- 1 | id,name ,price 2 | 1 ,Wrist Watch,10 3 | 2 ,Shoes ,8 4 | 3 ,Tshirt ,5 5 | 4 ,Jeans ,7 6 | 5 ,Sunglasses ,7 7 | -------------------------------------------------------------------------------- /resources/data/purchase.csv: -------------------------------------------------------------------------------- 1 | id ,productid,purchasedate,storeid 2 | 100,1 ,10/11/2019 ,1000 3 | 101,3 ,10/12/2019 ,1002 4 | 102,2 , ,1004 5 | 103,1 ,10/14/2019 ,1004 6 | 104,4 ,10/15/2019 ,1003 7 | 105,4 ,10/16/2019 ,1002 8 | -------------------------------------------------------------------------------- /resources/data/recipes/output/task1/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /resources/data/recipes/output/task1/.part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task1/.part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /resources/data/recipes/output/task1/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task1/_SUCCESS -------------------------------------------------------------------------------- /resources/data/recipes/output/task1/part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task1/part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet -------------------------------------------------------------------------------- 
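
Illustrative note on resources/data/config/logging.yaml above: the file handler's filename carries a {timestamp_placeholder} token, and the queue handler points at custom classes in utils.logging_util. Below is a minimal loading sketch, assuming PyYAML is installed and those custom classes are importable; the project's own bootstrap (utils/logging_util.py, utils/comprehensive_logging.py) may do this differently.

import datetime
import logging
import logging.config

import yaml

def init_logging_from_yaml(config_path):
    with open(config_path) as f:
        raw = f.read()
    # Substitute the literal {timestamp_placeholder} token before handing the dict to dictConfig.
    stamped = raw.replace("{timestamp_placeholder}",
                          datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S"))
    logging.config.dictConfig(yaml.safe_load(stamped))

# Hypothetical usage:
# init_logging_from_yaml("resources/data/config/logging.yaml")
# logging.getLogger("simpleExample").info("logging configured")
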
/resources/data/recipes/output/task2/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /resources/data/recipes/output/task2/.part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task2/.part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv.crc -------------------------------------------------------------------------------- /resources/data/recipes/output/task2/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task2/_SUCCESS -------------------------------------------------------------------------------- /resources/data/recipes/output/task2/part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv: -------------------------------------------------------------------------------- 1 | difficulty,avg_total_cooking_time 2 | easy,PT7M5.086705S 3 | hard,PT2H43M37.105263S 4 | medium,PT41M53.288136S 5 | -------------------------------------------------------------------------------- /resources/data/store.csv: -------------------------------------------------------------------------------- 1 | id ,name 2 | 1000,Borivili 3 | 1001,Kandivili 4 | 1002,Andheri 5 | 1003,Bandra 6 | 1004,Dadar 7 | 1005,Byculla 8 | -------------------------------------------------------------------------------- /sbin/common_functions.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #********************************************************# 3 | # Common bash reusable functions 4 | # library_functions.sh 5 | # April 2023 6 | #********************************************************# 7 | 8 | #********************************************************* 9 | # Comprehensive Logging 10 | #********************************************************* 11 | 12 | #-------------------------------------------------------------------- 13 | # Prints a log statement 14 | # Parameter: (message) (level: DEBUG,INFO,ERROR,AUDIT) 15 | # Returns: N/A 16 | #-------------------------------------------------------------------- 17 | log(){ 18 | local msg=$1 19 | local lvl=$2 20 | if [ -z "${lvl}" ]; then 21 | lvl="INFO" 22 | fi 23 | 24 | ## 0=default; 31=red; 33=yellow, 93=light yellow; 34=blue 25 | # shellcheck disable=SC2155 26 | # shellcheck disable=SC2034 27 | local lts=$(date +%FT%T.%3N) 28 | case "${lvl}" in 29 | ("ERROR") 30 | >&2 echo -e "\e[31m${lvl}" $$ "-" "${msg}\e[0m" 31 | ;; 32 | ("WARN") 33 | echo -e "\e[93m${lvl}" $$ "-" "${msg}\e[0m" 34 | ;; 35 | ("AUDIT") 36 | echo -e "\e[34m${lvl}" $$ "-" "${msg}\e[0m" 37 | isCheckRequired=true 38 | ;; 39 | ("DEBUG") 40 | echo -e "\e[33m${lvl}" $$ "-" "${msg}\e[0m" 41 | # shellcheck disable=SC2034 42 | isCheckRequired=true 43 | ;; 44 | (*) echo -e "\e[0m"$$ "-" "${msg}" 45 | return 1 46 | ;; 47 | esac 48 | } 49 | 50 | #-------------------------------------------------------------------- 51 | # Prints an error 52 | # Parameter: Error message 53 | # Returns: N/A 54 | #-------------------------------------------------------------------- 55 | logError(){ 56 | log "$1" "ERROR" 57 | } 58 | 59 | 
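# Illustrative usage of the logging helpers above (example calls shown as comments only):
#   log "Starting ingestion"              # level defaults to INFO, printed without colour
#   log "Low disk space" "WARN"           # light yellow
#   logError "spark-submit failed"        # red, written to stderr
#   logDebug "Parsed 3 arguments"         # yellow, also flags isCheckRequired=true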
#-------------------------------------------------------------------- 60 | # Prints a warn message 61 | # Parameter: Error message 62 | # Returns: N/A 63 | #-------------------------------------------------------------------- 64 | logWarn(){ 65 | log "$1" "WARN" 66 | } 67 | 68 | #-------------------------------------------------------------------- 69 | # Prints an audit message 70 | # Parameter: Error message 71 | # Returns: N/A 72 | #-------------------------------------------------------------------- 73 | logAudit(){ 74 | log "$1" "AUDIT" 75 | } 76 | 77 | #-------------------------------------------------------------------- 78 | # Prints a debug message 79 | # Parameter: Error message 80 | # Returns: N/A 81 | #-------------------------------------------------------------------- 82 | logDebug(){ 83 | log "$1" "DEBUG" 84 | } 85 | 86 | #-------------------------------------------------------------------- 87 | # Performs cleanup task 88 | # Parameter: N/A 89 | # Returns: N/A 90 | #--------------------------------------------------------------------- 91 | cleanup(){ 92 | log "Process finished successfully, logs can be found at ${LOG_FILE}" 93 | } 94 | 95 | #-------------------------------------------------------------------- 96 | # Called using trap on SIGINT, SIGQUIT, SIGABRT, SIGALRM, SIGTERM 97 | # Parameter: Error message 98 | # Returns: N/A 99 | #--------------------------------------------------------------------- 100 | interrupt(){ 101 | logError "Process got interrupted with exit code $?! Check error logs in ${LOG_FILE}" 102 | exit 1 103 | } 104 | 105 | #-------------------------------------------------------------------- 106 | # Displays a loading indicator for background jobs 107 | # Parameter: Subprocess pid 108 | # Returns: N/A 109 | #--------------------------------------------------------------------- 110 | loadingIndicator(){ 111 | local pid=$1 112 | spin='-\|/' 113 | 114 | local i=0 115 | while kill -0 $pid 2>/dev/null 116 | do 117 | i=$(( (i+1) %4 )) 118 | # shellcheck disable=SC2059 119 | printf "\r${spin:$i:1}" 120 | sleep .1 121 | done 122 | } 123 | -------------------------------------------------------------------------------- /sbin/create_python_venv.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #********************************************************# 3 | # Python Virtual Environment setup 4 | # create_python_venv.sh 5 | # April 2023 6 | #********************************************************# 7 | 8 | #-------------------------------------------------------------------- 9 | # Prints usage of script, non-zero exit in case of incorrect usage 10 | # Parameter: N/A 11 | # Returns: N/A 12 | #-------------------------------------------------------------------- 13 | scriptUsage() { 14 | logError "Usage: ${SCRIPT_NAME} [./create_python_venv.sh -n VIRTUAL_ENV_NAME]" 15 | logError "Do not use 'sh' shell to run the script; use 'bash' or ./create_python_venv.sh " 1>&2 16 | exit 1; 17 | } 18 | 19 | #----------------------------------------------------------------------- 20 | # Checks if python3 exists otherwise exit with non-zero 21 | # Parameter: N/A 22 | # Returns: 0 if python3 exist else exit with non-zero 23 | #----------------------------------------------------------------------- 24 | python3Exists() { 25 | logWarn "Checking if python3 is installed on machine" 26 | if ! 
which python3; then 27 | logError "python3 is not installed in the machine, please install python3 as base to create virtual environments on top of base python" 28 | exit 1; 29 | fi 30 | log "python3 already installed on machine" 31 | return 0; 32 | } 33 | 34 | #----------------------------------------------------------------------- 35 | # Checks if pip tool exists otherwise downloads from internet to install 36 | # Exit with non-zero in case of poor or no internet connection 37 | # Parameter: N/A 38 | # Returns: N/A 39 | #----------------------------------------------------------------------- 40 | pipExists() { 41 | logWarn "Checking if pip tool is installed on machine or attempt to download from internet & install" 42 | if ! pip --version; then 43 | if ! curl https://bootstrap.pypa.io/get-pip.py --output get-pip.py; then 44 | logError "Error downloading file from the internet; check your internet connection & proxy settings" 45 | exit 1; 46 | else 47 | log "Downloaded get-pip.py successfully" 48 | if ! python get-pip.py; then 49 | logError "Error installing pip, check logs" 50 | exit 1; 51 | fi 52 | log "pip installed successfully, upgrading" 53 | python3 -m pip install --upgrade pip 54 | return 0 55 | fi 56 | else 57 | log "pip tool already available on machine, upgrading" 58 | python3.9 -m pip install --upgrade pip 59 | return 0 60 | fi 61 | } 62 | 63 | #-------------------------------------------------------------------- 64 | # Installs virtualenvwrapper 65 | # Exit with non-zero in case of any error during installation, 66 | # Parameter: N/A 67 | # Returns: 0 if installation is successful, non-zero exit otherwise 68 | #-------------------------------------------------------------------- 69 | installVEnvWrapper() { 70 | mkdir -p "$HOME/python_venvs/" 71 | if ! pip install virtualenvwrapper; then 72 | logError "Error installing virtualenvwrapper using pip; check logs & check your internet connection & proxy" 73 | exit 1; 74 | fi 75 | return 0 76 | } 77 | 78 | #-------------------------------------------------------------------- 79 | # Creates python virtual environment by given name 80 | # Exit with non-zero in case of any error during creation, 81 | # Parameter: virtualEnvName: name of the virtual environment to create 82 | # Returns: N/A 83 | #-------------------------------------------------------------------- 84 | createVirtualEnv() { 85 | local virtualEnvName 86 | virtualEnvName=$(echo "$1" | xargs) #xargs is to trim 87 | local python3FullPath 88 | python3FullPath=$(which python3) 89 | export VIRTUALENVWRAPPER_PYTHON="${python3FullPath}" 90 | export WORKON_HOME="$HOME/python_venvs/" 91 | export PROJECT_HOME="${HOME_DIRECTORY}/../" 92 | logDebug "Using below environment variables & path for virtual environment ${virtualEnvName}" 93 | logDebug "VIRTUALENVWRAPPER_PYTHON=${VIRTUALENVWRAPPER_PYTHON}" 94 | logDebug "WORKON_HOME=${WORKON_HOME}" 95 | 96 | source virtualenvwrapper.sh 97 | 98 | rmvirtualenv "${virtualEnvName}" 99 | 100 | if ! 
mkvirtualenv -a "${HOME_DIRECTORY}/../" -p "${python3FullPath}" "${virtualEnvName}";then 101 | logError "Error creating virtual environment ${virtualEnvName}" 102 | exit 1; 103 | fi 104 | } 105 | 106 | #------------------------------------------------------------------------------ 107 | # installs required packages for virtual environment given in requirements.txt 108 | # Exit with non-zero in case of any error during installation, 109 | # Parameter: virtualEnvName: name of the virtual environment to create 110 | # Returns: N/A 111 | #------------------------------------------------------------------------------ 112 | installRequiredPackages() { 113 | local virtualEnvName 114 | virtualEnvName=$(echo "$1" | xargs) #xargs is to trim 115 | workon "${virtualEnvName}" 116 | cdproject 117 | pip install -r requirements.txt 118 | # pip freeze > requirements.txt 119 | python3 -m pip install --upgrade pip 120 | # source activate 121 | } 122 | 123 | #************************************************************************ 124 | # 125 | # MAIN SCRIPTS STARTS HERE 126 | # 127 | #************************************************************************ 128 | 129 | # Execute ./create_python_venv.sh -n hello-fresh-data-engg 130 | 131 | # Read initial variables 132 | HOST_NAME=`hostname` 133 | USER=`whoami` 134 | SCRIPT_NAME=$(basename "$0") 135 | HOME_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 136 | cd "${HOME_DIRECTORY}" || exit # exit in case cd fails; very rare 137 | source "${HOME_DIRECTORY}/common_functions.sh" 138 | 139 | export SETUPTOOLS_USE_DISTUTILS=stdlib 140 | 141 | # trap interrupts 0 SIGHUP SIGINT SIGQUIT SIGABRT SIGALRM SIGTERM 142 | trap interrupt 1 2 3 6 14 15 143 | trap cleanup 0 144 | 145 | while getopts ":n:" arg; do 146 | case "${arg}" in 147 | n) 148 | VENV_NAME=${OPTARG} 149 | ;; 150 | *) 151 | scriptUsage 152 | ;; 153 | esac 154 | done 155 | shift $((OPTIND-1)) 156 | 157 | if [[ -z ${VENV_NAME} ]]; then 158 | logError "Empty virtual environment name" 159 | scriptUsage 160 | fi 161 | 162 | mkdir -p "${HOME_DIRECTORY}/../logs/bash/" 163 | LOG_FILE="${HOME_DIRECTORY}/../logs/bash/log-python-venv-setup-${VENV_NAME}-$(date +%F-%H.%M.%S).log" 164 | # Global log redirect 165 | exec &> >(while read -r line; do printf '%s %s\n' "$(date -Iseconds): EXECUTION-LOG - $line"; done | tee -a "${LOG_FILE}" ) 166 | 167 | log "Executing $SCRIPT_NAME on $HOST_NAME with arguments" 168 | 169 | if python3Exists && pipExists; then 170 | installVEnvWrapper 171 | createVirtualEnv "${VENV_NAME}" 172 | installRequiredPackages "${VENV_NAME}" 173 | fi 174 | 175 | exit 0; 176 | -------------------------------------------------------------------------------- /sbin/execute-tasks-spark-submit.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Read initial variables 4 | HOME_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | cd "${HOME_DIRECTORY}" || exit # exit in case cd fails; very rare 6 | export PYTHONPATH=${PYTHONPATH}:"${HOME_DIRECTORY}/../src/" 7 | export VIRTUALENVWRAPPER_PYTHON="$(which python3)" 8 | export WORKON_HOME="$HOME/python_venvs/" 9 | export PROJECT_HOME="${HOME_DIRECTORY}../" 10 | source virtualenvwrapper.sh 11 | source "$HOME/python_venvs/hello-fresh-data-engg/bin/activate" 12 | 13 | while getopts ":e:c:m:" arg; do 14 | case "${arg}" in 15 | e) 16 | NUM_EXECS=${OPTARG} 17 | ;; 18 | c) 19 | EXEC_CORES=${OPTARG} 20 | ;; 21 | m) 22 | EXEC_MEM=${OPTARG} 23 | ;; 24 | *) 25 | scriptUsage 26 | ;; 27 | 
esac 28 | done 29 | shift $((OPTIND-1)) 30 | 31 | if [[ -z ${NUM_EXECS} || -z ${EXEC_CORES} || -z ${EXEC_MEM} ]]; then 32 | NUM_EXECS="2" 33 | NUM_CORES="1" 34 | EXEC_MEM="1g" 35 | fi 36 | 37 | FILES="${HOME_DIRECTORY}/../conf/data-quality/rules/production_configs/recipe-task1-dq-rules.json,${HOME_DIRECTORY}/../conf/data-quality/rules/production_configs/recipe-task2-dq-rules.json,${HOME_DIRECTORY}/../conf/spark/log4j.properties" 38 | 39 | spark-submit \ 40 | --master local[*] \ 41 | --name "HelloFresh Data Engineering Recipe tasks" \ 42 | --driver-memory 1g \ 43 | --num-executors "${NUM_EXECS}" \ 44 | --executor-cores "${NUM_CORES}" \ 45 | --executor-memory "${EXEC_MEM}" \ 46 | --conf spark.dynamicAllocation.enabled=false \ 47 | --conf spark.yarn.maxAppAttempts=1 \ 48 | --conf spark.serializer="org.apache.spark.serializer.KryoSerializer" \ 49 | --conf spark.driver.extraJavaOptions="-Dlog4j.configuration=log4j.properties" \ 50 | --files "${FILES}" \ 51 | ../src/com/vitthalmirji/datapipelines/recipe_tasks.py --input-data-dir "${HOME_DIRECTORY}/../resources/data/input" --output-data-dir "${HOME_DIRECTORY}/../resources/data/output" 52 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | from setuptools.command.install import install 5 | from setuptools import setup, find_packages 6 | 7 | with open('requirements.txt') as f: 8 | requirements = f.read().splitlines() 9 | 10 | tests_require = ['pytest', 'pytest-cov', 'coverage'] 11 | 12 | with open("docs/ETL_README.md", "r") as f: 13 | long_description = f.read() 14 | 15 | 16 | class ShellInstall(install): 17 | def run(self): 18 | if not sys.platform.startswith("linux"): 19 | print('Your platform {} might not be supported'.format(sys.platform)) 20 | else: 21 | print('Running create_python_venv.sh -n hello-fresh-data-engg') 22 | subprocess.call(['./sbin/create_python_venv.sh', '-n', 'hello-fresh-data-engg']) 23 | install.run(self) 24 | 25 | 26 | setup( 27 | cmdclass={'install': ShellInstall}, 28 | name='datapipelines-essentials', 29 | version='2.0', 30 | author='Vitthal Mirji', 31 | author_email='vitthalmirji@gmail.com', 32 | url='https://vitthalmirji.com', 33 | description='Datalake complex transformations simplified in PySpark', 34 | long_description='Simplified ETL process in Hadoop using Apache Spark. 
' 35 | 'SparkSession extensions, DataFrame validation, Column extensions, SQL functions, and DataFrame ' 36 | 'transformations', 37 | long_description_content_type="text/markdown", 38 | install_requires=requirements, 39 | tests_require=tests_require, 40 | extras_require={ 41 | 'test': tests_require, 42 | 'all': requirements + tests_require, 43 | 'docs': ['sphinx'] + tests_require, 44 | 'lint': [] 45 | }, 46 | license="GNU :: GPLv3", 47 | include_package_data=True, 48 | packages=find_packages(where='src', include=['com*']), 49 | package_dir={"": "src"}, 50 | setup_requires=['setuptools'], 51 | classifiers=[ 52 | "Programming Language :: Python :: 3", 53 | "License :: GNU :: GPLv3", 54 | "Operating System :: Linux", 55 | ], 56 | dependency_links=[], 57 | python_requires='>=3.7,<=3.9.5', 58 | keywords=['apachespark', 'spark', 'pyspark', 'etl', 'hadoop', 'bigdata', 'apache-spark', 'python', 'python3', 59 | 'data', 'dataengineering', 'datapipelines'] 60 | ) 61 | -------------------------------------------------------------------------------- /src/com/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/datapipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/datapipelines/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/datapipelines/clinical_trial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/datapipelines/clinical_trial/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/datawarehousing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/datawarehousing/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/datawarehousing/change_data_capture.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from pyspark.sql import SparkSession, DataFrame, Window 4 | from pyspark.sql.functions import col, row_number 5 | 6 | from com.vitthalmirji.utils.Utilities import is_null_or_empty 7 | 8 | 9 | def append_audit_attributes_to_xml(file, file_contents, xml_closing_tag): 10 | hash_val = hashlib.md5(file_contents.encode('utf-8')).hexdigest() 11 | return str(file_contents).replace(f'', 12 | f'{hash_val}' 14 | f'{str(file)}') 16 | 17 | 18 | def add_row_number_to_dataframe(dataframe: DataFrame, primary_keys, order_by_keys, 
eliminate_duplicate_records=False, 19 | drop_row_number_column=False): 20 | window = Window.partitionBy( 21 | *list(map(lambda c: col(c), primary_keys))).orderBy( 22 | *list(map(lambda c: col(c).desc(), order_by_keys))) 23 | row_num_col = row_number().over(window=window).alias('row_num') 24 | 25 | if eliminate_duplicate_records and drop_row_number_column: 26 | return dataframe.withColumn(colName='row_num', col=row_num_col).filter('row_num = 1').drop('row_num') 27 | elif eliminate_duplicate_records: 28 | return dataframe.withColumn(colName='row_num', col=row_num_col).filter('row_num = 1') 29 | else: 30 | return dataframe.withColumn(colName='row_num', col=row_num_col) 31 | 32 | 33 | def add_audit_columns(_df: DataFrame) -> DataFrame: 34 | import datetime 35 | ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 36 | df: DataFrame = _df 37 | sel_cols = list(map(lambda x: str(f'`{x}`'), df.schema.names)) 38 | sel_cols.append(f"reverse(split(input_file_name(), '/'))[0] AS spark_file_name") 39 | sel_cols.append(f"CAST('{ts}' AS TIMESTAMP) AS spark_timestamp") 40 | print(sel_cols) 41 | df: DataFrame = df.selectExpr(sel_cols) 42 | return df 43 | 44 | 45 | def identify_new_records(spark: SparkSession, old_dataframe: DataFrame, new_dataframe: DataFrame, 46 | primary_keys=[], order_by_keys=['current_timestamp']) -> DataFrame: 47 | old_df = "old_df" 48 | new_df = "new_df" 49 | 50 | if is_null_or_empty(primary_keys): 51 | print("WARNING - Empty primary keys given: Assuming all fields in the table for Deduplication") 52 | dedup_query = f"SELECT *FROM (SELECT t1.*, row_number() over (order by {','.join(order_by_keys)} desc) as row_num FROM {old_df} t1) WHERE row_num = 1" 53 | elif is_null_or_empty(old_dataframe) and is_null_or_empty( 54 | new_dataframe) and new_dataframe.count() <= 0 and old_dataframe.count() <= 0: 55 | print("Empty Dataframes") 56 | return None 57 | elif not is_null_or_empty(new_dataframe) and new_dataframe.count() > 0 and ( 58 | is_null_or_empty(old_dataframe) or old_dataframe.count() <= 0): 59 | print("Assuming initial load CDC not required") 60 | return new_dataframe 61 | else: 62 | print(f"Before CDC Staging count = {old_dataframe.count()}") 63 | dedup_query = f"SELECT *FROM (SELECT t1.*, row_number() over (partition by {','.join(primary_keys)} order by {','.join(order_by_keys)} desc) as row_num FROM {old_df} t1) WHERE row_num = 1" 64 | old_dataframe.createOrReplaceTempView(old_df) 65 | new_dataframe.createOrReplaceTempView(new_df) 66 | spark.sql(dedup_query).createOrReplaceTempView(old_df) 67 | 68 | join_condition = list(map(lambda x: str(f'{old_df}.{x} = {new_df}.{x}'), primary_keys)) 69 | exclude_condition = list(map(lambda x: str(f'{old_df}.{x} IS NULL'), primary_keys)) 70 | new_pks_query = f"SELECT {new_df}.* FROM {new_df} LEFT JOIN {old_df} ON {' AND '.join(join_condition)} WHERE {' AND '.join(exclude_condition)}" 71 | updates_query = f"SELECT {new_df}.* FROM {new_df} INNER JOIN {old_df} ON {' AND '.join(join_condition)} WHERE {new_df}.hashcode <> {old_df}.hashcode" 72 | print(f"Fetch only New PK records query = {new_pks_query}") 73 | print(f"Fetch updated records query = {updates_query}") 74 | new_pk_records_df: DataFrame = spark.sql(new_pks_query).dropDuplicates() 75 | updates_df: DataFrame = spark.sql(updates_query).dropDuplicates() 76 | 77 | return new_pk_records_df.union(updates_df) 78 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/etl/CColumn.py: 
-------------------------------------------------------------------------------- 1 | from pyspark.sql.types import * 2 | 3 | from com.vitthalmirji.etl import ETL 4 | 5 | 6 | class CColumn: 7 | def __init__(self, colname, coldatatype, pk, filterclause, udf="", udfargs=[], casttype="", aliasname=""): 8 | self.colname = colname 9 | self.coldatatype = coldatatype 10 | self.pk = pk 11 | self.udf = udf 12 | self.udfargs = udfargs 13 | self.aliasname = aliasname 14 | self.casttype = casttype 15 | self.filterclause = filterclause 16 | self.selectexpression = "" 17 | self.matchmetatype = { 18 | 'tinyint': IntegerType(), 19 | 'smallint': IntegerType(), 20 | 'int': IntegerType(), 21 | 'bigint': LongType(), 22 | 'long': LongType(), 23 | 'float': FloatType(), 24 | 'double': DoubleType(), 25 | 'boolean': BooleanType(), 26 | 'string': StringType(), 27 | 'date': DateType(), 28 | 'timestamp': TimestampType(), 29 | 'binary': BinaryType() 30 | } 31 | 32 | def applyUdf(self): 33 | if ETL.isNullOrEmpty(self.udf) is None and len(self.udfargs) is 0: 34 | # tempcol: pyspark.sql.column.Column = col(str(self.colname)) 35 | # tempcol.cast(self.matchmetatype[self.casttype]).alias(self.aliasname) 36 | self.selectexpression = f"CAST({self.colname} AS {self.casttype}) AS {self.aliasname}," 37 | elif ETL.isNullOrEmpty(self.udf) is not None and len(self.udfargs) is 0: 38 | # tempcol = col(self.colname) 39 | # kwargs = {'field': tempcol} 40 | # udfFunc = getattr(ETL, f"udf{str(self.udf).title()}") 41 | # tempcol = udfFunc(tempcol) 42 | # tempcol = tempcol.cast(self.matchmetatype[self.casttype]).alias(self.aliasname) 43 | self.selectexpression = f"CAST({self.udf}({self.colname}) AS {self.casttype}) AS {self.aliasname}," 44 | elif ETL.isNullOrEmpty(self.udf) is not None and len(self.udfargs) is not 0: 45 | # tempcol = col(self.colname) 46 | # udfFunc = getattr(ETL, f"udf{str(self.udf).title()}") 47 | # tempcol = udfFunc(tempcol) 48 | # tempcol = tempcol.cast(self.matchmetatype[self.casttype]).alias(self.aliasname) 49 | self.selectexpression = f"CAST({self.udf}({self.colname}, {','.join(self.udfargs)}) AS {self.casttype}) AS {self.aliasname}" 50 | else: 51 | self.selectexpression = f"CAST({self.colname} AS {self.casttype}) AS {self.aliasname}," 52 | return self.selectexpression 53 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/etl/ETL.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pyspark.sql.functions as f 4 | import pytz 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import * 7 | 8 | lookup = {} 9 | 10 | 11 | # ToDo - Yet to add many potential UDFs 12 | 13 | def registerAllUDF(sc: SparkSession): 14 | sc.udf.register(name='datetimetogmt', f=datetimeToGMT) 15 | sc.udf.register(name='zonedatetimetogmt', f=zoneDatetimeToGMTZone) 16 | sc.udf.register(name='isnullorempty', f=isNullOrEmpty) 17 | sc.udf.register(name='datetimetogmt', f=datetimeToGMT) 18 | sc.udf.register(name='udfnvl', f=udfNvl) 19 | sc.udf.register(name='udflookup', f=udfLookups) 20 | 21 | 22 | def datetimeToGMT(dt, fmt): 23 | local = pytz.timezone("America/Los_Angeles") 24 | # format = "%Y-%m-%d %H:%M:%S" 25 | naive = datetime.datetime.strptime(str(dt).strip(), str(fmt).strip()) 26 | local_dt = local.localize(naive, is_dst=None) 27 | utc_dt = local_dt.astimezone(pytz.utc) 28 | return utc_dt 29 | 30 | 31 | def strSplitSep(s, sep=','): 32 | return str(s).split(str(sep)) 33 | 34 | 35 | def varargsToList(*fields, 
sep): 36 | return str(sep).join(fields) 37 | 38 | 39 | def zoneDatetimeToGMTZone(dt, fmt, zone): 40 | local = pytz.timezone(str(zone).strip()) 41 | # format = "%Y-%m-%d %H:%M:%S" 42 | naive = datetime.datetime.strptime(str(dt).strip(), str(fmt).strip()) 43 | local_dt = local.localize(naive, is_dst=None) 44 | utc_dt = local_dt.astimezone(pytz.utc) 45 | return utc_dt 46 | 47 | 48 | @f.udf(returnType=StringType()) 49 | def udfNvl(field): 50 | if isNullOrEmpty(field) is None: 51 | return "-" 52 | else: 53 | return field 54 | 55 | 56 | @f.udf(returnType=StringType()) 57 | def udfLookups(clname, s): 58 | finallookupvalue = [] 59 | if s is None: 60 | return "" 61 | else: 62 | codes = str(s).split(sep=';') 63 | for cd in codes: 64 | if f"{clname} {cd}" in lookup.keys(): 65 | finallookupvalue.append(lookup[f"{clname} {cd}"]) 66 | else: 67 | finallookupvalue.append(cd) 68 | 69 | return ';'.join(finallookupvalue) 70 | 71 | 72 | def squared_udf(s): 73 | if s is None: 74 | return None 75 | return s * s 76 | 77 | 78 | def nullString(s): 79 | return s is None or str(s).strip().__eq__("") is None 80 | 81 | 82 | def isNullOrEmpty(s): 83 | if s is None: 84 | return None 85 | if str(s).strip() is None or str(s).strip().__eq__(""): 86 | return None 87 | return str(s).strip() 88 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/etl/ETLTransform.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import pyspark.sql.functions as SparkSQLFunctions 4 | from pyspark.sql import DataFrame, SparkSession 5 | 6 | from com.vitthalmirji.etl import ETL 7 | from com.vitthalmirji.etl.ITable import SourceTable, TargetTable, matchEqualityOperator 8 | from com.vitthalmirji.etl.meta import MetaModel 9 | from com.vitthalmirji.etl.meta.MetaModel import MetaResult 10 | 11 | 12 | # ToDo - Source & Target group aggregations 13 | 14 | class Transform: 15 | def __init__(self, targettable, model: MetaModel, sc: SparkSession): 16 | self.model = model 17 | self.spark = sc 18 | self.sourcetables: list[SourceTable] = [] 19 | self.targettable = targettable 20 | self.transformquery = "" 21 | self.joindict = {} 22 | self.sourcetablesdf: list[DataFrame] = [] 23 | self.targetdf: DataFrame = None 24 | self.targetcolumnslist = [] 25 | self.joincolumns = None 26 | self.jointype = None 27 | 28 | def genericDfOperation(self, operationFunc): 29 | return operationFunc(self) 30 | 31 | DataFrame.genericDfOperation = genericDfOperation 32 | 33 | def filterSourceTable(self, srctbl): 34 | srctbls = filter(lambda tbl: tbl.tablename == srctbl, self.sourcetables) 35 | return list(srctbls) 36 | 37 | def joinDataframes(self, dict1, dict2): 38 | targetdf: DataFrame = dict1['df'].join(dict2['df'], on=dict2['condition'], how=dict2['jointype']) 39 | return {'df': targetdf} 40 | 41 | def mapAggregationFunction(self, fieldname, functionname): 42 | if str(functionname).__eq__('min'): 43 | return SparkSQLFunctions.min(col=SparkSQLFunctions.col(fieldname)) 44 | elif str(functionname).__eq__('max'): 45 | return SparkSQLFunctions.max(col=SparkSQLFunctions.col(fieldname)) 46 | elif str(functionname).__eq__('count'): 47 | return SparkSQLFunctions.count(col=SparkSQLFunctions.col(fieldname)) 48 | elif str(functionname).__eq__('sum'): 49 | return SparkSQLFunctions.sum(col=SparkSQLFunctions.col(fieldname)) 50 | elif str(functionname).__eq__('avg'): 51 | return SparkSQLFunctions.avg(col=SparkSQLFunctions.col(fieldname)) 52 | 53 | def applyJoin(self): 54 | 
self.query, self.joindict = self.model.joinSQL(self.model.datamodel, 'purchase', 'product', 'store') 55 | 56 | joinlist = [] 57 | for k in self.joindict.keys(): 58 | srctabledf: DataFrame = self.filterSourceTable(k)[0].targetdf 59 | self.joindict[k].update({'df': srctabledf}) 60 | joinlist.append(self.joindict[k]) 61 | 62 | self.targetdf: DataFrame = functools.reduce(self.joinDataframes, joinlist)['df'] 63 | 64 | def applyFilters(self): 65 | tblinfo: MetaResult = self.model.filterMetaResultBySourceTable(self.sourcetables[0].tablename) 66 | targettable: TargetTable = TargetTable(sourcesystem=tblinfo.src_system, tablename=tblinfo.target_table, pk=[], 67 | database=tblinfo.target_database, 68 | filetype=tblinfo.target_filetype, filepath=tblinfo.target_file_path, 69 | modeltableorder=tblinfo.src_table_order) 70 | 71 | for metares in self.model.metaresultlist: 72 | filterexpr = matchEqualityOperator(expression=metares.src_col_filter) 73 | if filterexpr is not None and not filterexpr.__eq__("") and not filterexpr.lower().__eq__('none'): 74 | self.filterclause = f"{self.filterclause} {metares.target_col}{filterexpr}".strip() 75 | 76 | self.filterclause = self.filterclause.strip() 77 | 78 | if self.filterclause is None: 79 | self.filterclause = "" 80 | 81 | targettable.df: DataFrame = self.targetdf.filter(self.filterclause) 82 | 83 | def applyGroupAndAggregation(self): 84 | selectlist = [] 85 | aggregations = {} 86 | for metares in self.model.filterMetaResultByTargetTable(self.targettable): 87 | if ETL.isNullOrEmpty(metares.target_col_aggregator) is not None: 88 | selectlist.append(metares.target_col) 89 | else: 90 | aggregations.update({ 91 | metares.target_col: { 92 | 'function': metares.target_col_aggregator, 93 | 'filter': metares.target_col_aggregator_filter 94 | } 95 | }) 96 | 97 | self.targetdf: DataFrame = self.targetdf.groupby(*selectlist).agg(SparkSQLFunctions.min) 98 | 99 | def transform(self): 100 | # Get Unique source table names for Transformation 101 | srctables = set() 102 | for metares in self.model.metaresultlist: 103 | srctables.add(metares.src_table) 104 | 105 | # For each source table create SourceTable object and assign transform columns 106 | for srctable in srctables: 107 | tablemetaresult = self.model.filterMetaResultBySourceTable(srctbl=srctable) 108 | tblinfo: MetaResult = tablemetaresult[0] 109 | 110 | fklist = [] 111 | 112 | for item in self.model.datamodel.keys(): 113 | if self.model.datamodel[item]['fk'] is not None or self.model.datamodel[item]['fk'] is {}: 114 | if srctable in self.model.datamodel[item]['fk'].keys(): 115 | fklist.extend(self.model.datamodel[item]['fk'][srctable]['fk_pk']) 116 | 117 | sourcetable: SourceTable = SourceTable(sourcesystem=tblinfo.src_system, tablename=tblinfo.src_table, 118 | pk=self.model.datamodel[tblinfo.src_table]['pk'], 119 | fk=fklist, 120 | database=tblinfo.src_database, filepath=tblinfo.src_file_path, 121 | filetype=tblinfo.src_filetype, 122 | modeltableorder=tblinfo.src_table_order) 123 | self.sourcetables.append(sourcetable) 124 | for tbl in tablemetaresult: 125 | sourcetable.addColumn(name=tbl.src_col, type=tbl.src_col_datatype, 126 | pk=(True, False)[tbl.src_key_constraints.__eq__('pk')], 127 | udf=tbl.udf, udfargs=tbl.udfarguments, casttype=tbl.target_col_datatype, 128 | aliasname=tbl.target_col, filterclause=tbl.src_col_filter, fk={}) 129 | 130 | # Read file as dataframe 131 | sourcetable.readFileFromSource(spark=self.spark) 132 | 133 | ETL.registerAllUDF(sc=self.spark) 134 | for sourcetable in self.sourcetables: 135 | 
sourcetable.applyTransform() 136 | 137 | self.applyJoin() 138 | 139 | self.applyFilters() 140 | 141 | self.applyGroupAggregation() 142 | 143 | self.targetdf.show() 144 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/etl/ITable.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from abc import ABC 3 | 4 | from pyspark.sql import DataFrame, SparkSession 5 | 6 | from com.vitthalmirji.etl import ETL 7 | from com.vitthalmirji.etl.CColumn import CColumn 8 | from com.vitthalmirji.imports.HdfsImport import HdfsImport 9 | 10 | 11 | class ITable: 12 | sourcesystem: str 13 | tablename: str 14 | columnlist: [] 15 | pk: [] 16 | fk: [] 17 | database: str 18 | filepath: str 19 | modeltableorder: int 20 | 21 | @abc.abstractmethod 22 | def getColumnList(self): [] 23 | 24 | @abc.abstractmethod 25 | def getPkList(self): [] 26 | 27 | @abc.abstractmethod 28 | def getFkList(self): [] 29 | 30 | @abc.abstractmethod 31 | def getPath(self): str 32 | 33 | @abc.abstractmethod 34 | def getDatabaseName(self): str 35 | 36 | @abc.abstractmethod 37 | def readFileFromSource(self, park: SparkSession, opt={}, tbl=""): DataFrame 38 | 39 | 40 | def matchEqualityOperator(expression): 41 | expr = str(expression) 42 | if expr is None or expr.__eq__("None"): 43 | expr = str("") 44 | elif expr.find('eq(') != -1: 45 | expr = expr.replace('eq(', '=').replace(')', '') 46 | if expr.find('gt') != -1: 47 | expr = expr.replace('gt(', '>').replace(')', '') 48 | elif expr.find('lt') != -1: 49 | expr = expr.replace('lt(', '<').replace(')', '') 50 | elif expr.find('lte') != -1: 51 | expr = expr.replace('lte(', '<=').replace(')', '') 52 | elif expr.find('gte') != -1: 53 | expr = expr.replace('gte(', '>=').replace(')', '') 54 | elif expr.find('notin') != -1: 55 | expr = expr.replace('notin', 'NOT IN') 56 | elif expr.find('in') != -1: 57 | expr = expr.replace('in', 'IN') 58 | elif expr.find('ne') != -1: 59 | expr = expr.replace('ne(', '<>').replace(')', '') 60 | else: 61 | expr = expr.strip() 62 | 63 | if expr is None or expr.__eq__('None'): 64 | expr = "" 65 | 66 | return expr 67 | 68 | 69 | class SourceTable(ITable): 70 | def __init__(self, sourcesystem, tablename, pk, fk, database, filetype, filepath, modeltableorder): 71 | self.tablename = tablename 72 | self.pk = pk 73 | self.fk = fk 74 | self.database = database 75 | self.sourcesystem = sourcesystem 76 | self.filepath = filepath 77 | self.filetype = filetype 78 | self.modeltableorder = modeltableorder 79 | self.df: DataFrame = None 80 | self.columnlist: list[CColumn] = [] 81 | self.filterclause = "" 82 | 83 | def getFilterCondition(self): 84 | return self.filterclause 85 | 86 | def addColumn(self, name, type, pk, udf, udfargs, casttype, aliasname, filterclause, fk={}) -> None: 87 | col = CColumn(colname=name, coldatatype=type, pk=pk, udf=udf, udfargs=udfargs, casttype=casttype, 88 | aliasname=aliasname, filterclause=filterclause) 89 | 90 | filterexpr = matchEqualityOperator(expression=filterclause) 91 | if filterexpr is not None and not filterexpr.__eq__("") and not filterexpr.lower().__eq__('none'): 92 | self.filterclause = f"{self.filterclause} {name}{filterexpr}".strip() 93 | 94 | self.filterclause = self.filterclause.strip() 95 | 96 | if self.filterclause is None: 97 | self.filterclause = "" 98 | 99 | self.columnlist.append(col) 100 | 101 | def getPkList(self) -> []: 102 | return self.pk 103 | 104 | def getFkList(self) -> []: 105 | return self.fk 106 | 107 | def 
getColumnList(self) -> []: 108 | return self.columnlist 109 | 110 | def getDatabaseName(self) -> str: 111 | return self.database 112 | 113 | def getPath(self) -> str: 114 | return self.filepath 115 | 116 | def readFileFromSource(self, spark: SparkSession, opt={}, tbl="") -> DataFrame: 117 | importModule = HdfsImport(spark=spark) 118 | sourcedf = importModule.readFromSource(location=self.filepath, filetype=self.filetype, opt=opt) 119 | self.df: DataFrame = sourcedf 120 | return sourcedf 121 | 122 | def getDf(self) -> DataFrame: 123 | return self.df 124 | 125 | def applyTransform(self): 126 | selectexpression = "" 127 | for _srccol in self.columnlist: 128 | srccol: CColumn = _srccol 129 | selectexpression = f"{selectexpression}{srccol.applyUdf()}" 130 | 131 | selectexpression = f"{selectexpression}--End" 132 | selectexpression = selectexpression.strip(',--End') 133 | 134 | for p in self.pk: 135 | selectexpression = f"{selectexpression}, {p} AS {self.tablename}{p}" 136 | 137 | for f in self.fk: 138 | selectexpression = f"{selectexpression}, {f}" 139 | 140 | if ETL.isNullOrEmpty(self.filterclause) is not None: 141 | self.targetdf: DataFrame = self.df.filter(self.filterclause).selectExpr(selectexpression) 142 | else: 143 | self.targetdf: DataFrame = self.df.selectExpr(selectexpression) 144 | 145 | return self.targetdf 146 | 147 | 148 | class TargetTable(ITable, ABC): 149 | def __init__(self, sourcesystem, tablename, pk, database, filetype, filepath, modeltableorder): 150 | self.tablename = tablename 151 | self.pk = pk 152 | self.database = database 153 | self.sourcesystem = sourcesystem 154 | self.filepath = filepath 155 | self.filetype = filetype 156 | self.modeltableorder = modeltableorder 157 | self.df: DataFrame = None 158 | self.columnlist: list[CColumn] = [] 159 | self.sourcetableslist = list[SourceTable] = [] 160 | self.filterclause = "" 161 | self.aggregationcolumns = [] 162 | self.aggregationfilter = [] 163 | 164 | def getPkList(self) -> []: 165 | return self.pk 166 | 167 | def getFkList(self) -> []: 168 | return self.fk 169 | 170 | def getColumnList(self) -> []: 171 | return self.columnlist 172 | 173 | def getDatabaseName(self) -> str: 174 | return self.database 175 | 176 | def getPath(self) -> str: 177 | return self.filepath 178 | 179 | def addColumn(self, name, type, pk, filterclause) -> None: 180 | col = CColumn(colname=name, coldatatype=type, pk=pk, filterclause=filterclause) 181 | 182 | filterexpr = matchEqualityOperator(expression=filterclause) 183 | if filterexpr is not None and not filterexpr.__eq__("") and not filterexpr.lower().__eq__('none'): 184 | self.filterclause = f"{self.filterclause} {name}{filterexpr}".strip() 185 | 186 | self.filterclause = self.filterclause.strip() 187 | 188 | if self.filterclause is None: 189 | self.filterclause = "" 190 | 191 | self.columnlist.append(col) 192 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/etl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/etl/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/etl/meta/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/etl/meta/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/imports/HdfsImport.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from abc import ABC 3 | 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql.dataframe import DataFrame 6 | 7 | from com.vitthalmirji.etl import ETL 8 | 9 | 10 | class IImport: 11 | spark: SparkSession 12 | system: str 13 | table: str 14 | 15 | @abc.abstractmethod 16 | def readFromSource(self, location, filetype, opt={}, tbl=""): DataFrame 17 | 18 | @abc.abstractmethod 19 | def cleanup(self, location): None 20 | 21 | 22 | class HdfsImport(IImport, ABC): 23 | def __init__(self, spark: SparkSession): 24 | self.spark = spark 25 | 26 | def readFromSource(self, location, filetype, opt={}, tbl="") -> DataFrame: 27 | try: 28 | if str(filetype).lower().__eq__('tbl'): 29 | if ETL.isNullOrEmpty(tbl) is not None: 30 | try: 31 | _ = self.spark.read.table(tbl) 32 | except Exception as ex: 33 | print(f"Error reading table {tbl}") 34 | else: 35 | print(f"Invalid table {tbl} -Table do not exist in SQL Context: ") 36 | elif str(filetype).lower().__eq__('text'): 37 | return self.spark.read.text(paths=location, wholetext=True).toDF('line') 38 | elif str(filetype).lower().__eq__('csv'): 39 | return self.spark.read.options(header=True, inferSchema=True).csv(path=location) 40 | elif str(filetype).lower().__eq__('xml'): 41 | print(opt) 42 | return self.spark.read.format('com.databricks.spark.xml').options(rowTag='HotelDescriptiveContent', 43 | rootTag='HotelDescriptiveContents', 44 | valueTag='xmlvaluetag', 45 | attributePrefix="@").load( 46 | path=location) 47 | elif str(filetype).lower().__eq__('json'): 48 | return self.spark.read.options(options=opt).json(path=location) 49 | elif str(filetype).lower().__eq__('orc'): 50 | return self.spark.read.options(options=opt).orc(location) 51 | elif str(filetype).lower().__eq__('parquet'): 52 | return self.spark.read.options(options=opt).parquet(location) 53 | else: 54 | raise "Invalid filetype: " + filetype 55 | except Exception as ex: 56 | print("Error reading file in Spark of filetype " + filetype + " Error details: " + str(ex)) 57 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/imports/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/imports/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/kafka/Logger.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | from kafka import KafkaProducer 5 | 6 | logger_names = [] 7 | 8 | 9 | class Logger(logging.Handler): 10 | 11 | def __init__(self, Jobname, hostlist, topic, tls=None): 12 | self.__level = "INFO" 13 | self.__formatter = "%(asctime)s %(levelname)-8s %(message)s" 14 | self.__local_file_path = Jobname + ".log" 15 | logging.Handler.__init__(self) 16 | self.producer = KafkaProducer(bootstrap_servers=hostlist, 17 | value_serializer=lambda v: json.dumps(v).encode('utf-8'), 18 | linger_ms=10) 19 | self.topic = topic 20 | 21 | def get(self, name): 22 | global logger_names 23 
| logger = logging.getLogger(name) 24 | logger.setLevel(self.__level) 25 | if name not in logger_names: 26 | handler = logging.FileHandler(self.__local_file_path) 27 | formatter = logging.Formatter(self.__formatter) 28 | handler.setFormatter(formatter) 29 | handler.setLevel(self.__level) 30 | logger.addHandler(handler) 31 | logger_names.append(name) 32 | return logger 33 | 34 | # Write log to kafka topic 35 | def emit(self, record): 36 | # Avoid infinite loop by checking if Kafka's logs are looping in messages 37 | if 'kafka.' in record.name: 38 | return 39 | try: 40 | # apply the logger formatter 41 | msg = self.format(record) 42 | self.producer.send(self.topic, {'message': msg}) 43 | self.flush(timeout=1.0) 44 | except Exception: 45 | logging.Handler.handleError(self, record) 46 | 47 | def flush(self, timeout=None): 48 | # Flush all the objects 49 | self.producer.flush(timeout=timeout) 50 | 51 | def close(self): 52 | # Close producer and clean up 53 | self.acquire() 54 | try: 55 | if self.producer: 56 | self.producer.close() 57 | logging.Handler.close(self) 58 | finally: 59 | self.release() 60 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/kafka/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/kafka/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/main.py: -------------------------------------------------------------------------------- 1 | # Main file 2 | 3 | # Create objects & invoke methods required for your ETL process 4 | import datetime 5 | import logging 6 | 7 | from utils.Utilities import init_logging 8 | 9 | if __name__ == '__main__': 10 | init_logging(log_time_stamp=datetime.datetime.now().isoformat().__str__()) 11 | logging.debug("Hello") 12 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/mapper/Mapper.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from pyspark.sql import SparkSession 4 | from pyspark.sql.dataframe import DataFrame 5 | from pyspark.sql.types import DataType, StructType, ArrayType, StructField, LongType 6 | 7 | 8 | class IMapper: 9 | @abc.abstractmethod 10 | def getDataframeSchema(self, df: DataFrame): DataFrame 11 | 12 | def createDDL(self, df: DataFrame, database, table, location): str 13 | 14 | 15 | def generate_deterministic_surrogate_key(spark: SparkSession, df: DataFrame, keyOffset=1, colName="keyName"): 16 | try: 17 | new_schema = StructType([StructField(colName, LongType(), True)] + df.schema.fields) 18 | new_rdd = df.rdd.zipWithIndex().map(lambda row: ([row[1] + keyOffset] + list(row[0]))) 19 | max_key = new_rdd.map(lambda x: x[0]).max() 20 | final_df = spark.createDataFrame(new_rdd, new_schema) 21 | return final_df, max_key, "success", "errorNotFound" 22 | except Exception as e: 23 | return df, keyOffset, "error", e 24 | 25 | 26 | class ComplexDataMapper(IMapper): 27 | outerselects = [] 28 | 29 | def __init__(self, sc): 30 | self.spark: SparkSession = sc 31 | 32 | def getDataframeSchema(self, df: DataFrame) -> StructType: 33 | return df.schema 34 | 35 | def createDDL(self, df: DataFrame, database, table, location): 36 | newline = '\n' 37 | ddl = str("") 38 | if database.__eq__(""): 39 | ddl = str(f"CREATE EXTERNAL TABLE {table} 
{newline}({newline}") 40 | else: 41 | ddl = str(f"CREATE EXTERNAL TABLE {database}.{table} {newline}({newline}") 42 | 43 | bigarraytypes: list[(str, str)] = None 44 | 45 | for field in df.schema.fields: 46 | if len(field.dataType.simpleString()) <= 100000: 47 | ddl = ddl + str(f"`{field.name}` {field.dataType.simpleString()},{newline}") 48 | else: 49 | print(f"Found big tag {field.name} skipping.. as the type definition exceeds more than value set in " 50 | f"Ambari > Hive > Configs > Advanced > Custom hive-site hive.metastore.max.typename.length=100000") 51 | # bigarraytypes += list[(field.name, field.dataType.sql)] 52 | 53 | ddl = ddl.rstrip(',\n') 54 | 55 | ddl += f"{newline}) {newline}" \ 56 | f"STORED AS PARQUET {newline}" \ 57 | f"LOCATION {location};{newline};" 58 | 59 | return ddl 60 | 61 | def createViews(self, df: DataFrame, root_table_name='xmltable', 62 | columns_cascade_to_leaf_level_with_alias=None) -> {}: 63 | views = {} 64 | views, xpaths = self.complexTypeIterator(viewname="", viewpath="", database="", 65 | table=root_table_name, level=0, 66 | dtype=df.schema, acc={}, root_table_name=root_table_name, 67 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level_with_alias) 68 | return views, xpaths 69 | 70 | def handleStructType(self, viewname, viewpath, database, table, level, dtype, columns_cascade_to_leaf_level, acc={}, 71 | xpath=[]) -> {}: 72 | structtype: StructType = dtype 73 | selcols = [] 74 | if columns_cascade_to_leaf_level is not None and len(columns_cascade_to_leaf_level) > 0: 75 | cascade_columns = f", {','.join(list(map(lambda c: f't{level}.{c}', columns_cascade_to_leaf_level)))}" 76 | else: 77 | cascade_columns = "" 78 | if viewname is None or str(viewname).__eq__(""): 79 | viewname = table 80 | for field in structtype.fields: 81 | if str(field.dataType).lower().startswith("struct"): 82 | selcols.append(f"t{level}.`{field.name}`") 83 | viewname = field.name 84 | viewpath = f"{table.replace('.', '_')}.{viewname.replace('.', '_')}" 85 | query = f"SELECT t{level}.`{field.name}`.*, t{level}.surrogate_id_{table}, " \ 86 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \ 87 | f"{cascade_columns} " \ 88 | f"FROM {table} t{level} " 89 | keynm = f"{table.replace('.', '_')}_{viewname}" 90 | acc.update({keynm: query}) 91 | self.handleStructType(viewname=viewname, viewpath=viewpath, database=database, table=keynm, level=level, 92 | dtype=field.dataType, acc=acc, 93 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level) 94 | elif str(field.dataType).lower().startswith("array"): 95 | selcols.append(f"t{level}.`{field.name}`") 96 | arrtype: ArrayType = field.dataType 97 | if str(arrtype.elementType).lower().startswith("struct"): 98 | viewname = field.name 99 | viewpath = f"{table.replace('.', '_')}.{viewname.replace('.', '_')}" 100 | query = f"SELECT v{level}.*, t{level}.surrogate_id_{table}, " \ 101 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \ 102 | f"{cascade_columns} " \ 103 | f"FROM {table} t{level} LATERAL VIEW INLINE(t{level}.`{field.name}`) v{level}" 104 | keynm = f"{table.replace('.', '_')}_{viewname}" 105 | acc.update({keynm: query}) 106 | self.handleStructType(viewname=viewname, viewpath=viewpath, database=database, table=keynm, 107 | level=level + 1, dtype=arrtype.elementType, acc=acc, 108 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level) 109 | else: 110 | viewname = field.name 111 | viewpath = f"{table.replace('.', '_')}.{viewname.replace('.', '_')}" 112 | query = 
f"SELECT v{level}.col AS {viewname}, " \ 113 | f"t{level}.surrogate_id_{table}, " \ 114 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \ 115 | f"{cascade_columns} " \ 116 | f"FROM {table} t{level} " \ 117 | f"LATERAL VIEW EXPLODE(t{level}.`{field.name}`) v{level}" 118 | keynm = f"{table.replace('.', '_')}_{viewname}" 119 | acc.update({keynm: query}) 120 | xpath.append(f'{viewpath.replace(".", "/")}/{field.name}') 121 | else: 122 | xpath.append(f'{viewpath.replace(".", "/")}/{field.name}') 123 | selcols.append(f"t{level}.`{field.name}`") 124 | 125 | if len(selcols) > 0: 126 | query = f"SELECT {','.join(selcols)}, " \ 127 | f"monotonically_increasing_id() AS surrogate_id_{table} " \ 128 | f"{cascade_columns} " \ 129 | f"FROM {table} t{level}" 130 | keynm = f"{table.replace('.', '_')}_{viewname}_outer" 131 | # acc.update({keynm: query}) 132 | return acc 133 | 134 | def handleArrayType(self, viewname, viewpath, database, table, level, dtype: ArrayType, 135 | columns_cascade_to_leaf_level, acc={}, xpath=[]) -> {}: 136 | if columns_cascade_to_leaf_level is not None and len(columns_cascade_to_leaf_level) > 0: 137 | cascade_columns = f", {','.join(list(map(lambda c: f't{level}.{c}', columns_cascade_to_leaf_level)))}" 138 | else: 139 | cascade_columns = "" 140 | if str(dtype.elementType).lower().startswith("struct"): 141 | arr_struct_type: StructType = dtype.elementType 142 | viewname = arr_struct_type.name 143 | viewpath = f"{table.replace('.', '_')}.{viewname.replace('.', '_')}" 144 | query = f"SELECT v{level}.*, t{level}.surrogate_id_{table}," \ 145 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \ 146 | f"{cascade_columns} " \ 147 | f"FROM {table} t{level} " \ 148 | f"LATERAL VIEW INLINE(t{level}.`{arr_struct_type.name}`) v{level}" 149 | keynm = f"{table.replace('.', '_')}_{viewname}" 150 | acc.update({keynm: query}) 151 | self.handleStructType(viewname=viewname, viewpath=viewpath, database=database, table=keynm, 152 | level=level + 1, dtype=arr_struct_type, acc=acc, 153 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level) 154 | else: 155 | viewname = viewname 156 | viewpath = viewpath 157 | query = f"SELECT v{level}.col AS {viewname}, t{level}.surrogate_id_{table}, " \ 158 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \ 159 | f"{cascade_columns} " \ 160 | f"FROM {table} t{level} " \ 161 | f"LATERAL VIEW EXPLODE(t{level}.`{viewname}`) v{level}" 162 | keynm = f"{table.replace('.', '_')}_{viewname}" 163 | acc.update({keynm: query}) 164 | xpath.append(f'{viewpath.replace(".", "/")}/{viewname}') 165 | return acc, xpath 166 | 167 | def complexTypeIterator(self, viewname, viewpath, database, table, level, 168 | dtype: DataType, root_table_name, columns_cascade_to_leaf_level, acc={}, xpath=[]) -> {}: 169 | if viewname is None or str(viewname).__eq__(""): 170 | keynm = f"{table.replace('.', '_')}" 171 | if columns_cascade_to_leaf_level is not None: 172 | cascade_columns = f", {','.join(list(map(lambda c: f't{level}.{c}', columns_cascade_to_leaf_level)))}" 173 | else: 174 | cascade_columns = "" 175 | query = f"SELECT t{level}.*, " \ 176 | f"monotonically_increasing_id() AS surrogate_id_{table} " \ 177 | f"{cascade_columns} " \ 178 | f"FROM {root_table_name} t{level}" 179 | acc.update({keynm: query}) 180 | table = keynm 181 | 182 | columns_cascade_to_leaf_level = list( 183 | map(lambda c: f"{c.split('AS')[-1].strip()} AS {c.split('AS')[-1].strip()}", 184 | columns_cascade_to_leaf_level)) 185 | 
186 | if dtype.typeName().lower().__eq__("struct"): 187 | self.handleStructType(viewname=viewname, viewpath=viewpath, database=database, table=table, level=level, 188 | dtype=dtype, acc=acc, xpath=[], 189 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level) 190 | elif dtype.typeName().lower().__eq__("array"): 191 | self.handleArrayType(viewname=viewname, viewpath=viewpath, database=database, table=table, level=level, 192 | dtype=dtype, acc=acc, xpath=[], 193 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level) 194 | else: 195 | xpath.append(f'{viewpath.replace(".", "/")}/{viewname}') 196 | return acc, xpath 197 | return acc, xpath 198 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/mapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/mapper/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/objects/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/objects/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/objects/enums/Environments.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import socket 3 | 4 | from pyspark.sql import SparkSession 5 | 6 | 7 | class Environment: 8 | def __init__(self, name, nameservice, zookeeperquorum, historyserver): 9 | self.name = name 10 | self.nameservice = nameservice 11 | self.zookeeperquorum = zookeeperquorum 12 | self.historyserver = historyserver 13 | 14 | 15 | class IEnvironment: 16 | @abc.abstractmethod 17 | def getEnvironment(self, sc: SparkSession): Environment 18 | 19 | def getEnvironmentByServer(self): Environment 20 | 21 | 22 | class Environments(IEnvironment): 23 | def __init__(self): 24 | self.local = Environment("local", "", "localhost", "localhost:18081") 25 | self.dev = Environment("dev", "", "localhost", "localhost:18081") 26 | self.intg = Environment("intg", "", "localhost", "localhost:18081") 27 | self.test = Environment("test", "", "localhost", "localhost:18081") 28 | self.prod = Environment("prod", "", "localhost", "localhost:18081") 29 | 30 | def getEnvironment(self, sc: SparkSession) -> Environment: 31 | hostname = socket.gethostname() 32 | if hostname.lower().startswith("v") or hostname.lower().startswith("u"): 33 | return self.local 34 | elif hostname.lower().startswith("intg"): 35 | return self.intg 36 | elif hostname.lower().startswith("test"): 37 | return self.test 38 | elif hostname.lower().startswith("prod"): 39 | return self.prod 40 | 41 | def getEnvironmentByServer(self) -> Environment: 42 | hostname = socket.gethostname() 43 | if hostname.lower().startswith("v") or hostname.lower().startswith("u"): 44 | return self.local 45 | elif hostname.lower().startswith("intg"): 46 | return self.intg 47 | elif hostname.lower().startswith("test"): 48 | return self.test 49 | elif hostname.lower().startswith("prod"): 50 | return self.prod 51 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/objects/enums/Zones.py: 
-------------------------------------------------------------------------------- 1 | # ToDo - More extentions (if any) to various zones 2 | class Zones: 3 | def __init__(self): 4 | self.stage = "stage" 5 | self.work = "work" 6 | self.publish = "publish" 7 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/objects/enums/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/objects/enums/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/MockupData.py: -------------------------------------------------------------------------------- 1 | from random import Random 2 | from typing import Optional, Any 3 | 4 | from pyspark.sql.types import * 5 | 6 | 7 | # ToDo Yet to complete Random data generation 8 | class Maybe(object): 9 | def get_or_else(self, default): 10 | return self.value if isinstance(self, Just) else default 11 | 12 | 13 | class Just(Maybe): 14 | def __init__(self, value): 15 | self.value = value 16 | 17 | 18 | class Nothing(Maybe): 19 | pass 20 | 21 | 22 | # Random data generators for Spark SQL DataTypes. These generators do not generate uniformly random 23 | # values; instead, they're biased to return "interesting" values (such as maximum / minimum values) 24 | # with higher probability. 25 | class MockupData: 26 | # The conditional probability of a non-null value being drawn from a set of "interesting" values 27 | # instead of being chosen uniformly at random. 28 | PROBABILITY_OF_INTERESTING_VALUE: float = 0.5 29 | 30 | # The probability of the generated value being null 31 | PROBABILITY_OF_NULL: float = 0.1 32 | 33 | MAX_STR_LEN: int = 1024 34 | MAX_ARR_SIZE: int = 128 35 | MAX_MAP_SIZE: int = 128 36 | 37 | # Returns a randomly generated schema, based on the given accepted types. 38 | # @param numFields the number of fields in this schema 39 | # @param acceptedTypes types to draw from. 40 | def randomSchema(self, rand: Random, numFields: int, acceptedTypes: list[DataType]) -> StructType: 41 | structfields = [] 42 | i = 0 43 | while i < numFields: 44 | dt = acceptedTypes[rand.randint(1, len(acceptedTypes))] 45 | structfields.append(StructField(f"col_{i}", dt, nullable=bool(rand.getrandbits(1)))) 46 | return StructType(structfields) 47 | 48 | # Returns a function which generates random values for the given `DataType`, or `None` if no 49 | # random data generator is defined for that data type. The generated values will use an external 50 | # representation of the data type; for example, the random generator for `DateType` will return 51 | # instances of [[java.sql.Date]] and the generator for `StructType` will return a [[Row]]. 52 | # For a `UserDefinedType` for a class X, an instance of class X is returned. 53 | # #@param dataType the type to generate values for 54 | # @param nullable whether null values should be generated 55 | # @param rand an optional random number generator 56 | # @return a function which can be called to generate random values. 57 | def forType(self, dataType: DataType, nullable: bool, rand: Random = Random()) -> Optional[Any]: 58 | return Optional[Any]() 59 | 60 | # Generates a random row for `schema`. 
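    # For each field: a nullable array field becomes None with PROBABILITY_OF_NULL,
    # otherwise its elements are drawn from the forType() generator of the element type;
    # struct fields recurse into randomRow(); all other fields draw a single value from
    # the forType() generator for their data type.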
61 | def randomRow(self, rand: Random, schema: StructType) -> Row: 62 | fields = list(StructField) 63 | for f in schema.fields: 64 | if str(f.dataType).lower().__eq__("arraytype"): 65 | data = None 66 | if f.nullable and rand.random() <= self.PROBABILITY_OF_NULL: 67 | data = None 68 | else: 69 | arr = [] 70 | n = 1 71 | i = 0 72 | _f: ArrayType = f.dataType() 73 | generator = self.forType(_f.elementType, f.nullable, rand) 74 | assert (generator.isDefined, "Unsupported") 75 | gen = generator.get 76 | while i < n: 77 | arr.append(gen) 78 | i = i + 1 79 | data = arr 80 | fields.append(data) 81 | elif str(f.dataType).lower().__eq__("structtype"): 82 | _f: StructType = f 83 | for c in _f: 84 | fields.append(self.randomRow(rand, StructType(c.dataType()))) 85 | else: 86 | generator = self.forType(f.dataType, f.nullable, rand) 87 | assert (generator.isDefined, "Unsupported") 88 | gen = generator.get 89 | fields.append(gen) 90 | return Row(*fields) 91 | 92 | # Returns a random nested schema. This will randomly generate structs and arrays drawn from 93 | # acceptedTypes. 94 | def randomNestedSchema(self, rand: Random, totalFields: int, acceptedTypes: list[DataType]) -> StructType: 95 | fields = [] 96 | i = 0 97 | numFields = totalFields 98 | while numFields > 0: 99 | v = rand.randint(0, 3) 100 | if v is 0: 101 | # Simple type 102 | dt = acceptedTypes[rand.randint(0, len(acceptedTypes))] 103 | fields.append(StructField(f"col_{i}", dt, bool(rand.getrandbits(1)))) 104 | numFields = -1 105 | elif v is 1: 106 | # Array 107 | dt = acceptedTypes[rand.randint(0, len(acceptedTypes))] 108 | fields.append(StructField(f"col_{i}", ArrayType(dt), bool(rand.getrandbits(1)))) 109 | numFields = -1 110 | else: 111 | n = max(rand.randint(0, numFields), 1) 112 | nested = self.randomNestedSchema(rand, n, acceptedTypes) 113 | fields.append(StructField("col_" + i, nested, bool(rand.getrandbits(1)))) 114 | numFields = numFields - n 115 | 116 | i = i + 1 117 | return StructType(fields) 118 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/utils/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/audit_util.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def audit_action(action): 5 | def audit_decorator(func): 6 | def audit(*args, **kwargs): 7 | # Invoke the wrapped function first 8 | retval = func(*args, **kwargs) 9 | # Now do something here with retval and/or action 10 | logging.debug(f'Executed {action}, Callback return value {retval}') 11 | return retval 12 | return audit 13 | return audit_decorator 14 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/comprehensive_logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | import sys 4 | from pathlib import Path 5 | 6 | from com.vitthalmirji.utils.constants import JOB_START_TIME 7 | from com.vitthalmirji.utils.helpers import read_json_get_dict, get_project_root 8 | 9 | 10 | def init_logging(job_name, log_time_stamp=JOB_START_TIME, log_path=f'{get_project_root()}/logs/python', 11 | 
log_properties_path=f"{get_project_root()}/conf/python/logging-properties.json"): 12 | """ 13 | Initiates the logging object with given configurations 14 | 15 | Args: 16 | :param log_properties_path: Location of properties file. 17 | default to local project folder's /conf/python/logging-properties.json 18 | :param job_name: Name of the application 19 | :param log_time_stamp: Timestamp to append in log file name 20 | :param log_path: Location to store logs. 21 | Default location /logs/python/ 22 | 23 | Returns: N/A 24 | """ 25 | Path(log_path).mkdir(parents=True, exist_ok=True) 26 | log_conf = read_json_get_dict(json_path=log_properties_path) 27 | log_file = f"{log_path}/log-{job_name}_{log_time_stamp}.log" 28 | log_conf['handlers']['file']['filename'] = log_file 29 | 30 | # In case of Unit test cases do not log to file 31 | if 'unittest' in sys.modules.keys(): 32 | log_conf['handlers'] = {'console': log_conf['handlers']['console']} 33 | log_conf['root']['handlers'] = ['console'] 34 | 35 | print('Logging initiating using below properties') 36 | print(log_conf) 37 | logging.config.dictConfig(log_conf) 38 | logging.info(f'Logging initiated; appending logs to {log_file}') 39 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/constants.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from com.vitthalmirji.utils.helpers import get_user 4 | 5 | USER = get_user() 6 | JOB_START_TIME = datetime.now().strftime('%Y-%m-%dT%H-%M-%S-%f') 7 | SPARK_APPLICATION_NAME = f"Spark application launched by {USER}" 8 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/data_quality.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | from typing import List 4 | 5 | from pyspark.sql import DataFrame 6 | from pyspark.sql.functions import count 7 | 8 | from com.vitthalmirji.utils.helpers import create_dir, log_exception_details 9 | from com.vitthalmirji.utils.spark import get_or_create_spark_session 10 | 11 | 12 | class Rule(object): 13 | def __init__(self, rule_id: int, name: str, description: str, rule_type: str, columns: List[str] = None, 14 | query: str = None): 15 | self.rule_id = rule_id 16 | self.name = name 17 | self.description = description 18 | self.rule_type = rule_type 19 | self.columns = columns if columns else None 20 | self.query = query if query else None 21 | 22 | 23 | class RuleExecutionResult: 24 | def __init__(self, rule: Rule, status, pass_count: int, fail_count: int, total_count): 25 | self.rule = rule 26 | self.status = status 27 | self.pass_count = pass_count 28 | self.fail_count = fail_count 29 | self.total_count = total_count 30 | 31 | 32 | class DataQuality(object): 33 | def __init__(self, dq_id: int, rules: list[dict] = None, email_execution_report_to: str = None, 34 | execution_reports_dir: str = None): 35 | logging.info( 36 | f"Initializing Data quality service for DQ ID {dq_id}, reports will be available in file {execution_reports_dir}") 37 | self.html_report = None 38 | self.df = None 39 | self.total_count = None 40 | self.execution_results = None 41 | self.dq_id = dq_id 42 | self.rules: List[Rule] = [Rule(**rule) for rule in rules] if rules else None 43 | self.email_execution_report_to = email_execution_report_to if email_execution_report_to else None 44 | self.spark = get_or_create_spark_session() 45 | self.yarn_id 
= self.spark.sparkContext.applicationId 46 | self.execution_reports_dir = execution_reports_dir if execution_reports_dir else None 47 | if self.execution_reports_dir: 48 | create_dir(self.execution_reports_dir) 49 | 50 | def execute_unique_rule(self, rule: Rule): 51 | """ 52 | Executes Duplicates check on given Primary keys in `rule` 53 | 54 | Args: 55 | :param rule: Rule of type `unique` having list of primary keys 56 | 57 | Returns: 58 | :return: RuleExecutionResult with status fail if duplicates are present pass otherwise and count of duplicates 59 | 60 | Exceptions: 61 | :exception: Thrown by calling functions called in this function 62 | """ 63 | logging.warning(f"Executing DQ Rule for {rule.name} on {rule.columns}") 64 | dups_count = self.df.select(rule.columns).groupby(rule.columns).agg(count("*").alias('cnt')).alias( 65 | 'cnt').filter('cnt > 1').count() 66 | 67 | return RuleExecutionResult(rule, 'fail' if dups_count > 0 else 'pass', self.total_count - dups_count, 68 | dups_count, self.total_count) 69 | 70 | def execute_not_null_rule(self, rule: Rule): 71 | """ 72 | Executes Not null check on given list of columns in `rule` 73 | 74 | Args: 75 | :param rule: Rule of type `not null` having list of columns potentially not null 76 | 77 | Returns: 78 | :return: RuleExecutionResult with status fail if column values are null & pass otherwise and count of null records 79 | 80 | Exceptions: 81 | :exception: Thrown by calling functions called in this function 82 | """ 83 | logging.warning(f"Executing DQ Rule for {rule.name} on {rule.columns}") 84 | filter_string = ' OR '.join(list(map(lambda c: f'{c} IS NULL OR TRIM({c}) = ""', rule.columns))) 85 | not_null_count = self.df.select(rule.columns).filter(filter_string).count() 86 | return RuleExecutionResult(rule, 'fail' if not_null_count > 0 else 'pass', self.total_count - not_null_count, 87 | not_null_count, self.total_count) 88 | 89 | def execute_query_rule(self, rule: Rule): 90 | """ 91 | Executes query given in `rule` 92 | This is in case of custom data quality rule given in form of query 93 | Args: 94 | :param rule: Rule of type `query` having query to execute 95 | 96 | Returns: 97 | :return: RuleExecutionResult with status fail if duplicates are present pass otherwise and count of duplicates 98 | 99 | Exceptions: 100 | :exception: Thrown by calling functions called in this function 101 | """ 102 | self.df.createOrReplaceTempView('temp') 103 | query = rule.query 104 | logging.warning(f"Executing DQ Rule for {rule.name} using query {rule.query}") 105 | query_count = self.spark.sql(query).count() 106 | return RuleExecutionResult(rule, 'fail' if query_count > 0 else 'pass', 107 | self.total_count - query_count, 108 | query_count, self.total_count) 109 | 110 | def execute_rules(self, df: DataFrame) -> tuple[bool, str]: 111 | """ 112 | Executes list of rules (data quality checks) given on dataframe's data 113 | Args: 114 | :param df: Dataframe on which quality checks to be executed 115 | :param rules: List of rules mapped to Rule type 116 | 117 | Returns: 118 | :return: boolean status True if all rules executed successfully without any failures, False otherwise and 119 | HTML report of details executed rules 120 | 121 | Exceptions: 122 | :exception: All exceptions thrown by calling functions called in this function 123 | """ 124 | logging.info("Starting data quality rules executions..") 125 | self.execution_results: List[RuleExecutionResult] = [] 126 | self.df = df 127 | self.total_count = self.df.count() 128 | for unique_rule in 
list(filter(lambda r: r.rule_type.__eq__('unique'), self.rules)): 129 | self.execution_results.append(self.execute_unique_rule(unique_rule)) 130 | 131 | for not_null_rule in list(filter(lambda r: r.rule_type.__eq__('not null'), self.rules)): 132 | self.execution_results.append(self.execute_not_null_rule(not_null_rule)) 133 | 134 | for query_rule in list(filter(lambda r: r.rule_type.__eq__('query'), self.rules)): 135 | self.execution_results.append(self.execute_query_rule(query_rule)) 136 | 137 | return False if list(filter(lambda exec_result: exec_result.status.__eq__('fail'), self.execution_results)) \ 138 | else True, self.generate_report() 139 | 140 | def generate_report(self): 141 | """ 142 | Generates HTML report of result of executed data quality checks 143 | 144 | Args: N/A 145 | 146 | Returns: 147 | :return: self.html_report a HTML report of details about executed DQ checks 148 | 149 | Exceptions: 150 | :exception: All exception thrown by calling functions called in this function 151 | """ 152 | logging.info(f"Preparing Data quality rules report for {self.dq_id}") 153 | table_header = ' '.join(list( 154 | map(lambda header: f"{header}", ["Yarn Application Id", "DQ ID", "Rule ID", "Rule Name", 155 | "Rule type", "Description", "Columns/Query", "Pass Count", 156 | "Fail Count", 157 | "Total Count"]))) 158 | 159 | def rules_and_result(result: RuleExecutionResult): 160 | table_data = [self.yarn_id, 161 | self.dq_id, 162 | result.rule.rule_id, 163 | result.rule.name, 164 | result.rule.rule_type, 165 | result.rule.description, 166 | result.rule.columns, 167 | result.pass_count, 168 | result.fail_count, 169 | result.total_count 170 | ] 171 | return ' '.join(list(map(lambda d: f"{d}", table_data))) 172 | 173 | failed_rules = list(filter(lambda result: result.status.__eq__('fail'), self.execution_results)) 174 | failed_details = ' '.join(list(map(lambda result: f"{rules_and_result(result)}", failed_rules))) 175 | failure_table = f'

<h3>Failed DQ details</h3><table border="1">' \
176 |                         f'{table_header}' \
177 |                         f'{failed_details}</table>' if failed_rules else ""
178 | 
179 |         passed_rules = list(filter(lambda result: result.status.__eq__('pass'), self.execution_results))
180 |         passed_details = ' '.join(list(map(lambda result: f"{rules_and_result(result)}", passed_rules)))
181 |         passed_table = f'<br><h3>Succeeded DQ details</h3><table border="1">' \
182 |                        f'{table_header}' \
183 |                        f'{passed_details}</table>' if passed_rules else ""
184 | 
185 |         opening_statement = "<br><br>Team,<br><br>" \
186 |                             f"Data Quality check finished successfully for DQ ID = {self.dq_id}" \
187 |                             f"{', with failures. ' if failed_rules else '. '}" \
188 |                             "Check details in below table of metrics.<br><br>"
189 |         closing_statement = "<br><br>" \
190 |                             f"Executed on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')},<br>" \
191 |                             "Thanks<br>
" 192 | 193 | self.html_report = f"{opening_statement}" \ 194 | f"{failure_table if failed_rules else ''}" \ 195 | f"{passed_table if passed_rules else ''}" \ 196 | f"{closing_statement}" 197 | 198 | return self.html_report 199 | 200 | def write_report_to_html(self, file_name): 201 | """ 202 | Writes Data Quality rules execution results to a html file 203 | 204 | Args: 205 | :param file_name: name of file to write report as HTML file 206 | 207 | Returns: 208 | :return: N/A 209 | 210 | Exceptions: 211 | :exception Throws exception if unable to write into html file but will not halt the execution process 212 | """ 213 | logging.info(f"Writing data quality execution report to html file {self.execution_reports_dir}/{file_name}") 214 | try: 215 | if not self.execution_reports_dir: 216 | raise Exception("Empty file path") 217 | f = open(self.execution_reports_dir + "/" + file_name, "w") 218 | f.write(self.html_report) 219 | f.close() 220 | except Exception as ex: 221 | log_exception_details(message="Error writing report to html, skipping writing report", 222 | exception_object=ex) 223 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/helpers.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import json 3 | import logging 4 | import traceback 5 | from pathlib import Path 6 | 7 | import isodate 8 | from isodate import ISO8601Error 9 | 10 | 11 | def create_dir(dir_path): 12 | """ 13 | Creates directory from given path 14 | :param dir_path: relative path of directory to create 15 | :return: N/A 16 | """ 17 | try: 18 | Path(dir_path).mkdir(parents=True, exist_ok=True) 19 | except Exception as ex: 20 | msg = f"Error creating directory from given relative path {dir_path}" 21 | log_exception_details(message=msg, exception_object=ex) 22 | raise ex 23 | 24 | 25 | def get_user(): 26 | """ 27 | Fetches username of the executor 28 | 29 | Args: 30 | 31 | Returns: 32 | :return: username of the executor / logged in machine 33 | """ 34 | return getpass.getuser() 35 | 36 | 37 | def is_null_or_empty(obj) -> bool: 38 | """ 39 | Checks if an object is null or empty if object is of type string 40 | 41 | Args: 42 | :param obj: object / variable to validate 43 | 44 | Returns: 45 | :return: bool True of object is null or string is empty False otherwise 46 | """ 47 | if obj is None: 48 | return True 49 | elif type(obj) is str and str(obj).strip().__eq__(''): 50 | return True 51 | else: 52 | return False 53 | 54 | 55 | def get_project_root() -> Path: 56 | """ 57 | Identifies project root, Returns project root, the repository root 58 | Args: 59 | 60 | Returns: 61 | :return: project's root path as type Path 62 | """ 63 | return Path(__file__).parent.parent.parent.parent.parent 64 | 65 | 66 | def read_json_get_dict(json_path) -> dict: 67 | """ 68 | Reads json file from given `json_path` & returns as python dict 69 | Args: 70 | :param :json_path : Absolute or Relative path of json file to read & convert 71 | 72 | Return: 73 | :return :json_as_dict: JSON content as dictionary type 74 | """ 75 | try: 76 | with open(json_path, 'r') as stream: 77 | json_as_dict = json.load(stream) 78 | stream.close() 79 | return json_as_dict 80 | except Exception as ex: 81 | log_exception_details(f'Error reading json file {json_path}, error traceback below', ex) 82 | 83 | 84 | def log_exception_details(message, exception_object): 85 | """ 86 | Logs the exception to console & log file for every exception 87 | 88 | Args: 89 | 
:param message: Developer's message on exception 90 | :param exception_object: Class object of the exception 91 | 92 | Returns: N/A 93 | """ 94 | logging.error(exception_object.__str__()) 95 | logging.error(traceback.format_exc()) 96 | logging.exception(message) 97 | 98 | 99 | def convert_iso_to_time_duration(iso_time_duration: str): 100 | """ 101 | Converts ISO time duration to time in hours, minutes & seconds 102 | 103 | Args: 104 | :param iso_time_duration: ISO time in string Example: PT1H, PT100M, PT2H5M 105 | 106 | Returns: 107 | :return: Returns duration as datetime.timedelta type. 108 | Example: 01:00:00, 01:40:00, 02:05:00 109 | """ 110 | if is_null_or_empty(iso_time_duration): 111 | msg = f'Empty or Invalid time duration string {iso_time_duration}' 112 | logging.error(msg) 113 | return None 114 | try: 115 | return isodate.parse_duration(iso_time_duration) 116 | except ISO8601Error as isoError: 117 | msg = f"Error converting ISO time {iso_time_duration} to timedelta" 118 | log_exception_details(message=msg, exception_object=isoError) 119 | return None 120 | 121 | 122 | def add_iso_time_duration(time1: str, time2: str): 123 | """ 124 | Adds two string time duration, first converts to timedelta then adds to return the result 125 | Args: 126 | :param time1: First time as string value 127 | :param time2: Second time as string value 128 | 129 | Returns: 130 | :return: time1 + time2 as datetime.timedelta type 131 | """ 132 | if is_null_or_empty(time1) or is_null_or_empty(time2): 133 | msg = f'Empty or Invalid time duration string time1 = {time1}, time2 = {time2}' 134 | logging.error(msg) 135 | return None 136 | 137 | try: 138 | _time1 = convert_iso_to_time_duration(iso_time_duration=time1) 139 | _time2 = convert_iso_to_time_duration(iso_time_duration=time2) 140 | return isodate.duration_isoformat((_time1 + _time2)) 141 | except ISO8601Error as isoError: 142 | msg = f"Error converting ISO time time1={time1} & time2={time2} to timedelta" 143 | logging.error(msg) 144 | log_exception_details(message=msg, exception_object=isoError) 145 | return None 146 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/logging_util.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import logging 3 | from logging.config import ConvertingList, ConvertingDict, valid_ident 4 | from logging.handlers import QueueHandler, QueueListener 5 | from queue import Queue 6 | 7 | 8 | def _resolve_handlers(l): 9 | if not isinstance(l, ConvertingList): 10 | return l 11 | # Indexing the list performs the evaluation. 
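    # ConvertingList holds handler references that logging.config resolves lazily;
    # reading each index forces the conversion so the QueueListener receives real
    # Handler objects instead of unresolved config references.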
12 | return [l[i] for i in range(len(l))] 13 | 14 | 15 | def _resolve_queue(q): 16 | if not isinstance(q, ConvertingDict): 17 | return q 18 | if '__resolved_value__' in q: 19 | return q['__resolved_value__'] 20 | 21 | cname = q.pop('class') 22 | klass = q.configurator.resolve(cname) 23 | props = q.pop('.', None) 24 | kwargs = {k: q[k] for k in q if valid_ident(k)} 25 | result = klass(**kwargs) 26 | if props: 27 | for name, value in props.items(): 28 | setattr(result, name, value) 29 | 30 | q['__resolved_value__'] = result 31 | return result 32 | 33 | 34 | class QueueListenerHandler(QueueHandler): 35 | def __init__(self, handlers, respect_handler_level=False, auto_run=True, queue=Queue(-1)): 36 | queue = _resolve_queue(queue) 37 | super().__init__(queue) 38 | handlers = _resolve_handlers(handlers) 39 | self._listener = QueueListener( 40 | self.queue, 41 | *handlers, 42 | respect_handler_level=respect_handler_level) 43 | if auto_run: 44 | self.start() 45 | atexit.register(self.stop) 46 | 47 | def start(self): 48 | self._listener.start() 49 | 50 | def stop(self): 51 | self._listener.stop() 52 | 53 | # def emit(self, record): 54 | # return super().emit(record) 55 | 56 | 57 | BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8) 58 | 59 | # These are the sequences need to get colored output 60 | RESET_SEQ = "\033[0m" 61 | COLOR_SEQ = "\033[0;%dm" 62 | BOLD_SEQ = "\033[1m" 63 | 64 | COLORS = { 65 | 'WARNING': YELLOW, 66 | 'INFO': GREEN, 67 | 'DEBUG': MAGENTA, 68 | 'CRITICAL': RED, 69 | 'ERROR': RED 70 | } 71 | 72 | 73 | class ColoredFormatter(logging.Formatter): 74 | def __init__(self, msg, use_color=True): 75 | logging.Formatter.__init__(self, msg) 76 | self.use_color = use_color 77 | 78 | def format(self, record): 79 | if self.use_color and record.levelname in COLORS: 80 | # The background is set with 40 plus the number of the color, and the foreground with 30 81 | record.levelname = COLOR_SEQ % (30 + COLORS[record.levelname]) + record.levelname + RESET_SEQ 82 | return logging.Formatter.format(self, record) 83 | 84 | 85 | def formatter_message(message, use_color=True): 86 | if use_color: 87 | message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ) 88 | else: 89 | message = message.replace("$RESET", "").replace("$BOLD", "") 90 | return message 91 | 92 | 93 | class ColoredLogger(logging.Logger): 94 | # FORMAT = "$BOLD%(name)-20s$RESET][%(levelname)-18s] %(message)s ($BOLD%(filename)s$RESET:%(lineno)d)" 95 | def __init__(self, name): 96 | logging.Logger.__init__(self, name, logging.DEBUG) 97 | self.FORMAT = '%(asctime)s %(name)-15s [$BOLD%(levelname)-10s$RESET] %(process)-10d %(funcName)-30s %(message)s' 98 | self.COLOR_FORMAT = formatter_message(self.FORMAT, True) 99 | color_formatter = ColoredFormatter(self.COLOR_FORMAT) 100 | 101 | console = logging.StreamHandler() 102 | console.setFormatter(color_formatter) 103 | 104 | self.addHandler(console) 105 | return 106 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/spark.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List 3 | 4 | from pyspark.sql import SparkSession, DataFrame 5 | from pyspark.sql.functions import concat_ws, col, floor, rand 6 | from pyspark.sql.types import StringType 7 | 8 | from com.vitthalmirji.utils.helpers import log_exception_details, is_null_or_empty 9 | 10 | 11 | def get_or_create_spark_session(need_hive_support: bool = False, 12 | spark_conf: List[dict] = 
[{'key': 'spark.app.name', 'value': ''}]) -> SparkSession: 13 | """ 14 | Creates a spark session with given configuration in parameters 15 | 16 | Args: 17 | :param application_name: Name of the spark application 18 | :param spark_conf: Specific Spark Configurations at user level (default is None) 19 | :param need_hive_support: Enable Hive support in spark session? (default is False) 20 | 21 | Returns: 22 | An object of SparkSession 23 | 24 | Exceptions: 25 | Throws any exception on to calling function that has encountered during creating SparkSession 26 | :exception type of exception is broader, this can be improvised to handle more specific exceptions 27 | """ 28 | spark: SparkSession 29 | try: 30 | spark: SparkSession = SparkSession.getActiveSession() 31 | if spark: 32 | logging.warning("Returning active spark session") 33 | return spark 34 | 35 | logging.warning(f"Creating spark session first time with configs {spark_conf}") 36 | 37 | if need_hive_support: 38 | spark = SparkSession.builder \ 39 | .enableHiveSupport() \ 40 | .getOrCreate() 41 | else: 42 | spark = SparkSession.builder \ 43 | .getOrCreate() 44 | 45 | for conf in list(spark_conf): 46 | spark.conf.set(**conf) 47 | 48 | logging.warning(f"Executor cores = {spark.conf.get('spark.executor.cores', 'Not set')}") 49 | logging.warning(f"Num Executors = {spark.conf.get('spark.executor.instances', 'Not set')}") 50 | return spark 51 | except Exception as ex: 52 | log_exception_details(message="Error creating spark session", exception_object=ex) 53 | raise ex 54 | 55 | 56 | def read_data_as_spark_dataframe(filetype: str, location: str, options={}, table_name=None) -> DataFrame: 57 | """ 58 | Reads various kind of files & tables in spark 59 | Args: 60 | :param filetype: 61 | :param location: 62 | :param options: 63 | :param table_name: 64 | 65 | Returns: 66 | :return: A DataFrame object 67 | 68 | Exception: 69 | Throws any exception that is encountered during file / table read in spark 70 | :exception type of exception is broader, this can be improvised to handle more specific exceptions 71 | """ 72 | logging.warning(f"Attempting to read {filetype} in spark using configs {options} from location {location}") 73 | spark = get_or_create_spark_session() 74 | try: 75 | if str(filetype).lower().__eq__('table'): 76 | if is_null_or_empty(table_name) is not None: 77 | try: 78 | _ = spark.read.options(**options).table(table_name) 79 | except Exception as ex: 80 | log_exception_details(message=f"Error reading table {table_name}", exception_object=ex) 81 | raise ex 82 | else: 83 | print(f"Invalid table {table_name} -Table do not exist in SQL Context: ") 84 | elif str(filetype).lower().__eq__('text'): 85 | logging.warning( 86 | "Lines will be read from the text file and dataframe will have single column by name 'line'") 87 | return spark.read.options(**options).text(paths=location).toDF('line') 88 | elif str(filetype).lower().__eq__('csv'): 89 | return spark.read.options(**options).csv(path=location) 90 | elif str(filetype).lower().__eq__('xml'): 91 | return spark.read.format('com.databricks.spark.xml').options(**options).load(path=location) 92 | elif str(filetype).lower().__eq__('json'): 93 | return spark.read.options(**options).json(path=location) 94 | elif str(filetype).lower().__eq__('orc'): 95 | return spark.read.options(**options).orc(location) 96 | elif str(filetype).lower().__eq__('parquet'): 97 | return spark.read.options(**options).parquet(location) 98 | else: 99 | raise Exception(f"Invalid filetype: {filetype}") 100 | except Exception as 
ex: 101 | log_exception_details(message=f"Error reading file in Spark of filetype {filetype}", exception_object=ex) 102 | raise ex 103 | 104 | 105 | def revise_shuffle_partitions(multiplier: int = 1): 106 | """ 107 | Sets the shuffle partition to total number of cores across all executors 108 | Useful in dataframe operations using spark 109 | :param multiplier: In case of stage failures increase the multiplier 110 | :return: N/A 111 | """ 112 | spark = get_or_create_spark_session() 113 | num_executors = int(spark.conf.get('spark.executor.instances', '2').strip()) 114 | num_cores = int(spark.conf.get('spark.executors.cores', '1').strip()) 115 | revised_shuffle_partition = num_executors * num_cores * multiplier 116 | spark.conf.set('spark.sql.shuffle.partitions', f"{revised_shuffle_partition}") 117 | 118 | 119 | def data_frame_repartition(df: DataFrame, num_files: int = None, use_coalesce=False, repartition_columns=None): 120 | """ 121 | Function to repartition data for better performance. 122 | Majorly has 2 types: #1 - coalesce: to narrow down files in output; #2 - repartition: to uniformly distribute data in output 123 | Note: This involves shuffling (wide transformation) 124 | Args: 125 | :param df: Dataframe on which repartition (wide transformation) to be performed 126 | :param num_files: Number of output files required 127 | :param use_coalesce: Use this to narrow down the number of files irrespective of any columns default is False 128 | :param repartition_columns: Columns on which repartition to be performed 129 | Most important note: Columns specified here must & should be low cardinality values in table 130 | Returns: 131 | :return: Dataframe with repartition or coalesce transformation applied 132 | """ 133 | if use_coalesce: 134 | return df.coalesce(num_files) 135 | 136 | columns_list = list(map(lambda column: col(column).cast(StringType()), 137 | repartition_columns)) if repartition_columns is not None else [] 138 | 139 | if num_files is None and len(columns_list) > 0: 140 | return df.repartition(*columns_list) 141 | 142 | salting_column = floor(rand() * num_files) 143 | temp_repartition_column = 'temp_repartition_column' 144 | return df.withColumn( 145 | temp_repartition_column, 146 | concat_ws('~', *columns_list, salting_column) 147 | ).repartition(temp_repartition_column).drop(temp_repartition_column) 148 | 149 | 150 | def standardize_and_rename_df_columns(df: DataFrame, column_names_to_rename: dict): 151 | """ 152 | Performs renaming column names on given dataframe: 153 | Trims if column name has leading & trailing whitespaces 154 | For given dictionary of columns renames according to specified name 155 | Args: 156 | :param df: DataFrame for renaming columns 157 | :param column_names_to_rename: dictionary having existing column name & revised / renaming column name 158 | 159 | Returns: 160 | :return: _df transformed dataframe with column names renamed 161 | 162 | Exceptions: 163 | :exception Throws exception that's encountered during renaming column on dataframe 164 | """ 165 | _df = df 166 | try: 167 | # Trim and lowercase all column names 168 | for column_name in filter(lambda c: not column_names_to_rename.keys().__contains__(c), df.columns): 169 | _df = _df.withColumnRenamed(column_name, column_name.strip().lower()) 170 | 171 | for column_name, revised_column_name in column_names_to_rename.items(): 172 | _df = _df.withColumnRenamed(column_name, revised_column_name) 173 | return _df 174 | except Exception as ex: 175 | log_exception_details(message=f"Error renaming columns 
on given dataframe {column_names_to_rename}", 176 | exception_object=ex) 177 | raise ex 178 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/spark_submit_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | from datetime import datetime 4 | 5 | from com.vitthalmirji.utils.Utilities import get_dates_between_range 6 | 7 | START_TIME = datetime.now().isoformat().__str__() 8 | 9 | 10 | class StaticConfigParameterNotFound(Exception): 11 | pass 12 | 13 | 14 | def sort_spark_submit_options(command_options): 15 | sorted_command_options = sorted(command_options, key=lambda k: k[0]) 16 | return sorted_command_options 17 | 18 | 19 | def update_conf(command_options): 20 | _conf = {k: v for k, v in command_options.items() if k == '--conf'}['--conf']['value'] 21 | _conf = get_spark_conf_key_value_as_string(_conf) 22 | _conf = f"\"{_conf}\"" 23 | command_options['--conf'].update({'value': _conf}) 24 | 25 | return command_options 26 | 27 | 28 | def get_class_arguments_as_string(command): 29 | return ' \\\n'.join(list(map(lambda c: f"{c}={command['--class_arguments']['value'][c]}", 30 | command['--class_arguments']['value']))) 31 | 32 | 33 | def get_spark_conf_key_value_as_string(conf): 34 | return f"""{",".join([f"{d}={conf[d]}" for d in conf])}\"""" 35 | 36 | 37 | def static_config_args_sanity_check(command, config): 38 | for cmd in command: 39 | if config.get(cmd) is None and command[cmd]['required'] is True: 40 | logging.error(f"Configuration file do not have required spark-submit option {cmd}") 41 | raise StaticConfigParameterNotFound( 42 | f"ERROR: Configuration file do not have required spark-submit option {cmd}") 43 | elif config.get(cmd) is not None: 44 | command[cmd].update({'value': config.get(cmd)}) 45 | else: 46 | continue 47 | return command 48 | 49 | 50 | def update_spark_submit_option_values(runtime_args, config_args, command): 51 | config_args['default']['--conf'].update(config_args[runtime_args['workflow']]['spark_conf']) 52 | config_args['default']['--name'] = f"\"{runtime_args['workflow']}\"" 53 | command['--class_arguments']['value'].update(runtime_args) 54 | return config_args, command 55 | 56 | 57 | def prepare_spark_submit(runtime_args, config_args, app_config): 58 | command = app_config['spark_submit_options_order'] 59 | _config_args, command = update_spark_submit_option_values(runtime_args, config_args, command) 60 | _config_args = _config_args['default'] 61 | 62 | command = static_config_args_sanity_check(command, _config_args) 63 | 64 | command_date_ranges = get_dates_between_range(refresh_type=runtime_args['refreshType'], 65 | start_date=runtime_args['startDate'], 66 | end_date=runtime_args['endDate'], 67 | interval_in_days=app_config['default_settings'][ 68 | 'history_load_interval_in_days'], 69 | date_pattern='%Y-%m-%d') 70 | logging.debug(f"Date Range = {command_date_ranges}") 71 | command = update_conf(command_options=command) 72 | spark_submit_command = ' \\\n'.join(f"{k} {v['value']}" for k, v in command.items() if k != '--class_arguments') 73 | 74 | logging.debug(command) 75 | command_list = [] 76 | for d in command_date_ranges: 77 | command['--class_arguments']['value'].update(d) 78 | class_args = get_class_arguments_as_string(command) 79 | command_list.append(f"{spark_submit_command}\n{class_args}") 80 | return command_list 81 | -------------------------------------------------------------------------------- 
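The spark-submit helpers above assemble the final command from nested option dictionaries. Below is a minimal, hypothetical sketch (the option map and conf values are illustrative only, and it assumes the repository's src path is on PYTHONPATH) showing how update_conf flattens the '--conf' entry into the single quoted key=value,key=value string that later gets joined into the spark-submit command:

from com.vitthalmirji.utils.spark_submit_utils import update_conf

# Hypothetical option map; only the '--conf' entry is touched by update_conf().
command_options = {
    '--conf': {
        'required': True,
        'value': {
            'spark.executor.memory': '4g',
            'spark.sql.shuffle.partitions': '200',
        },
    },
}

updated = update_conf(command_options=command_options)
# The conf dict has been collapsed into one quoted "key=value,key=value" string,
# ready to be placed after --conf when prepare_spark_submit() assembles the command.
print(updated['--conf']['value'])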
/src/com/vitthalmirji/utils/transformation_extension.py: -------------------------------------------------------------------------------- 1 | from pyspark.rdd import RDD 2 | from pyspark.sql.dataframe import DataFrame 3 | 4 | 5 | def transform(self, f): 6 | return f(self) 7 | 8 | 9 | RDD.transform = transform 10 | DataFrame.transform = transform 11 | -------------------------------------------------------------------------------- /tests/EtlTransformTest.py: -------------------------------------------------------------------------------- 1 | import time 2 | import unittest 3 | 4 | from etl import Transform 5 | from etl.meta import MetaModel 6 | from utils.Utilities import SparkSettings 7 | 8 | start_time = time.time() 9 | 10 | 11 | class MyTestCase(unittest.TestCase): 12 | def testEtlTransformations(self): 13 | self.spark = SparkSettings("EtlTransformTest").getSparkSession() 14 | metamodel = MetaModel(datamodelpath='resources/datamodel.csv', sc=self.spark) 15 | 16 | # print(f"Data model as JSON -> \n{metamodel.datamodel}") 17 | 18 | metamodel.readMetadataFromCsv(sc=self.spark, metadatapath='resources/meta.csv', targettable='invoice') 19 | metamodel.readSourceFilesIntoDF() 20 | 21 | targetddl = metamodel.getTargetDdl('PARQUET', True) 22 | # print('------Target DDL ------') 23 | # print('------Target Query ------') 24 | # print(f"{queryhead} {querytail}") 25 | 26 | # self.spark.sql(f"{queryhead} {querytail}").show() 27 | 28 | trans = Transform(targettable='invoice', model=metamodel, sc=self.spark) 29 | 30 | trans.transform() 31 | self.assertIsNotNone(trans) 32 | 33 | 34 | if __name__ == '__main__': 35 | unittest.main() 36 | -------------------------------------------------------------------------------- /tests/UtilsTest.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import unittest 3 | 4 | from utils.Utilities import create_spark_session, count_words, split_words 5 | 6 | 7 | class UtilsTest(unittest.TestCase): 8 | 9 | def test1_testSparkSettings(self): 10 | print("Testing Spark Settings") 11 | self.spark = create_spark_session(application_name="Utils Test") 12 | self.assertEqual(str(self.spark.version), "2.4.5") 13 | self.assertEqual(str(self.spark.sparkContext.sparkUser()), getpass.getuser()) 14 | 15 | metadf: DataFrame = self.spark.read.option("header", "true").format("csv").load(path="resources/meta.csv") 16 | 17 | self.assertEqual(True, True) 18 | 19 | def test2_test_custom_transformations(self): 20 | print("Testing Environment") 21 | self.spark = create_spark_session(application_name="Utils Test") 22 | line_array = ["Hello,World,How,are,you", "Hello.World.How.are.you", "Hello;World;How;are;you", 23 | "Hello-World-How-are-you", "Hello|World|How|are|you", "Hello World How are you"] 24 | 25 | lines_rdd: RDD[str] = self.spark.sparkContext.parallelize(line_array) 26 | df = lines_rdd.transform(lambda _rdd: split_words(_rdd)).transform(lambda _rdd: count_words(_rdd)) 27 | df.toDF().toDF("Word", "Count").show() 28 | 29 | self.assertTrue(df is not None) 30 | self.assertEqual(df.count(), 5) 31 | 32 | 33 | if __name__ == '__main__': 34 | unittest.main() 35 | -------------------------------------------------------------------------------- /tests/XmlMapperTest.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from com.vitthalmirji.imports.HdfsImport import HdfsImport 4 | from com.vitthalmirji.mapper.Mapper import ComplexDataMapper 5 | 6 | 7 | class 
XmlMapperTest(unittest.TestCase): 8 | def test_create_hive_ql_for_nested_data_explode(self): 9 | print("Testing HdfsImport readFromSource") 10 | self.sparksettings = SparkSettings("XmlMapperTest") 11 | self.spark = self.sparksettings.getSparkSession() 12 | self.hdfsImport = HdfsImport(self.spark) 13 | 14 | # Read JSON file from given path 15 | json_df = self.spark.read.json(path='resources/clinical_trial/*.xml') 16 | 17 | json_df.printSchema() 18 | 19 | # Register as temporary view / table for flattening queries to execute on 20 | json_df.createOrReplaceTempView('jsontable') 21 | 22 | # self.spark.range(10).select(monotonically_increasing_id()).show() 23 | # self.spark.range(10).select(monotonically_increasing_id()).coalesce(1).show() 24 | # self.spark.range(10).repartition(5).select(monotonically_increasing_id()).coalesce(1).show() 25 | 26 | # Create an object of class XmlMapper from Mapper.py by passing spark variable 27 | xml_mapper: ComplexDataMapper = ComplexDataMapper(sc=self.spark) 28 | 29 | # Call createViews function by passing json_df dataframe, it returns 2 things flattening queries and XPATH ( 30 | # Only for XML; Ignore for JSON) 31 | view_queries = xml_mapper.createViews(df=json_df, root_table_name='jsontable', 32 | columns_cascade_to_leaf_level_with_alias=[ 33 | 'item.organizationId AS pk_organizationId']) 34 | 35 | # Loop through all queries, execute them, physicalize flattened attributes as table - Repeat steps to all 36 | # queries (Nested attributes) 37 | for q in view_queries[0]: 38 | print(f'{q}:' f'{view_queries[0][q]}') 39 | temp_df = self.spark.sql(view_queries[0][q]) 40 | temp_df.rdd.zipWithUniqueId().toDF().printSchema() 41 | temp_df.createOrReplaceTempView(q) 42 | select_cols = [] 43 | for col in temp_df.schema.fields: 44 | if not str(col.dataType).lower().startswith("struct") and not str(col.dataType).lower().startswith( 45 | "array"): 46 | select_cols.append(col.name) 47 | print(f"Total partitions = {temp_df.rdd.getNumPartitions()}") 48 | temp_df.select(select_cols).show() 49 | 50 | 51 | if __name__ == '__main__': 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /tests/aws_test/AwsS3Test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import subprocess 4 | import unittest 5 | 6 | import boto3 7 | from pyspark.sql.dataframe import DataFrame 8 | from pyspark.sql.session import SparkSession 9 | 10 | from utils.Utilities import list_s3_files 11 | 12 | 13 | class AwsS3Test(unittest.TestCase): 14 | @classmethod 15 | def setUpClass(cls) -> None: 16 | # create an s3 connection that points to the moto server. 17 | cls.s3_resource_obj = boto3.resource( 18 | "s3", 19 | endpoint_url="http://127.0.0.1:5000" 20 | ) 21 | 22 | cls.s3_client_obj = boto3.client( 23 | "s3", 24 | endpoint_url="http://127.0.0.1:5000" 25 | ) 26 | # start moto server, by default it runs on localhost on port 5000. 27 | cls.process = subprocess.Popen( 28 | ['moto_server', 's3'], 29 | stdout=subprocess.PIPE, 30 | shell=True, 31 | creationflags=subprocess.CREATE_NEW_PROCESS_GROUP 32 | ) 33 | 34 | # create an S3 bucket. 35 | cls.s3_resource_obj.create_bucket(Bucket="bucket") 36 | 37 | # # configure pyspark to use hadoop-aws module. 
os.environ[ "PYSPARK_SUBMIT_ARGS" ] = '--packages 38 | # "org.apache.hadoop:hadoop-aws:2.7.3" --packages "org.apache.httpcomponents:httpclient:4.2.5" ' \ 39 | # '--packages "org.xerial.snappy:snappy-java:1.1.7.3" pyspark-shell ' 40 | 41 | # get the spark session object and hadoop configuration. 42 | cls.spark: SparkSession = SparkSession.builder.getOrCreate() 43 | cls.hadoop_conf = cls.spark.sparkContext._jsc.hadoopConfiguration() 44 | # mock the aws credentials to access s3. 45 | cls.hadoop_conf.set("fs.s3a.access.key", "dummy-value") 46 | cls.hadoop_conf.set("fs.s3a.secret.key", "dummy-value") 47 | # we point s3a to our moto server. 48 | cls.hadoop_conf.set("fs.s3a.endpoint", "http://127.0.0.1:5000") 49 | # we need to configure hadoop to use s3a. 50 | cls.hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 51 | 52 | @classmethod 53 | def test_dataframe_operation_s3(cls): 54 | # create a pyspark dataframe. 55 | values = [("k1", 1), ("k2", 2)] 56 | columns = ["key", "value"] 57 | df = cls.spark.createDataFrame(values, columns) 58 | # write the dataframe as csv to s3. 59 | df.write.mode('overwrite').csv("s3://bucket/source.csv") 60 | # read the dataset from s3 61 | df = cls.spark.read.csv("s3://bucket/source.csv") 62 | # print Data 63 | df.show() 64 | # assert df is a DataFrame 65 | assert isinstance(df, DataFrame) 66 | 67 | print("test_dataframe_operation_s3 successfully completed") 68 | 69 | @classmethod 70 | def test_3_create_directory_files_s3(cls): 71 | some_binary_data = b'Here we have some data' 72 | 73 | cls.s3_client_obj.put_object(Bucket="bucket", Key=("dir1" + '/')) 74 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir1/dir1.txt') 75 | 76 | cls.s3_client_obj.put_object(Bucket="bucket", Key=("dir1/subdir1" + '/')) 77 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir1/subdir1/dir1_subdir1.txt') 78 | 79 | cls.s3_client_obj.put_object(Bucket="bucket", Key=("dir1/subdir2" + '/')) 80 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir1/subdir2/dir1_subdir2.txt') 81 | 82 | cls.s3_client_obj.put_object(Bucket="bucket", Key="dir2/") 83 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir2/dir2.txt') 84 | 85 | cls.s3_client_obj.put_object(Bucket="bucket", Key="dir2/subdir1/") 86 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir2/subdir1/dir2_subdir1.txt') 87 | 88 | cls.s3_client_obj.put_object(Bucket="bucket", Key="dir2/subdir2/") 89 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir2/subdir2/dir2_subdir2.txt') 90 | 91 | contents = list_s3_files(opt={'Bucket': 'bucket'}) 92 | print(contents) 93 | contents = list_s3_files(opt={'Bucket': 'bucket'}, files_only=True) 94 | print(contents) 95 | contents = list_s3_files(opt={'Bucket': 'bucket'}, files_only=True, 96 | file_extension='.csv') 97 | print(contents) 98 | contents = list_s3_files(opt={'Bucket': 'bucket'}, files_only=True, 99 | file_extension='.xml') 100 | print(contents) 101 | 102 | @classmethod 103 | def tearDownClass(cls) -> None: 104 | # shut down the moto server.
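# NOTE: subprocess.CREATE_NEW_PROCESS_GROUP (used in setUpClass above) only exists on Windows,
# so this start/stop pattern is Windows-specific; TestGlueJob in test_glue_job.py terminates the
# whole process group with os.killpg instead, and a portable in-process alternative is sketched
# at the end of this listing.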
105 | os.kill(cls.process.pid, signal.SIGTERM) 106 | 107 | 108 | if __name__ == '__main__': 109 | unittest.main() 110 | -------------------------------------------------------------------------------- /tests/aws_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/tests/aws_test/__init__.py -------------------------------------------------------------------------------- /tests/aws_test/glue_job.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark.context import SparkContext 4 | 5 | 6 | # https://github.com/aws-samples/aws-glue-samples/tree/master/examples 7 | 8 | def run(cli_args, spark): 9 | # init glue pyspark job 10 | glue_args = _get_glue_args(cli_args=cli_args) 11 | spark_session, job = _get_spark_session_and_glue_job(glue_args) 12 | 13 | # run glue job code 14 | source = cli_args["source"] 15 | destination = cli_args["destination"] 16 | df = spark.read.csv(source) 17 | df.write.csv(destination) 18 | 19 | # commit job 20 | _commit_job(job) 21 | 22 | 23 | def _get_spark_session_and_glue_job(glue_args): 24 | from awsglue.context import GlueContext 25 | from awsglue.job import Job 26 | 27 | sc = SparkContext.getOrCreate() 28 | glue_context = GlueContext(sparkContext=sc) 29 | job = Job(glue_context=glue_context) 30 | job.init(glue_args["JOB_NAME"], glue_args) 31 | return glue_context.spark_session, job 32 | 33 | 34 | def _commit_job(job): 35 | job.commit() 36 | 37 | 38 | def _get_glue_args(cli_args): 39 | from awsglue.utils import getResolvedOptions 40 | glue_args = getResolvedOptions(args=cli_args, options=["JOB_NAME", "source", "destination"]) 41 | print(glue_args) 42 | return glue_args 43 | 44 | 45 | if __name__ == "__main__": 46 | run(["source", "destination"]) 47 | -------------------------------------------------------------------------------- /tests/aws_test/test_glue_job.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import subprocess 4 | import unittest 5 | from unittest import mock 6 | 7 | import boto3 8 | from pyspark.sql import SparkSession 9 | 10 | from aws_test import glue_job 11 | from utils.Utilities import delete_s3_bucket 12 | 13 | 14 | class TestGlueJob(unittest.TestCase): 15 | """ 16 | This test class setup a test environment to test our glue job, 17 | runs the glue job and checks the result. 18 | """ 19 | 20 | @classmethod 21 | def setUpClass(cls): 22 | """ 23 | the setup class starts a moto server, creates an S3 bucket, 24 | configures PySpark and Spark and dumps the source dataframe to S3. 
25 | """ 26 | S3_MOCK_ENDPOINT = "http://127.0.0.1:5000" 27 | 28 | # setup moto server 29 | # cls.process = subprocess.Popen( 30 | # "moto_server s3", stdout=subprocess.PIPE, 31 | # shell=True, preexec_fn=os.setsid() 32 | # ) 33 | 34 | os.environ['AWS_ACCESS_KEY_ID'] = 'test' 35 | os.environ['AWS_SECRET_ACCESS_KEY'] = 'test' 36 | 37 | cls.process = subprocess.Popen( 38 | ['moto_server', 's3'], 39 | stdout=subprocess.PIPE, 40 | shell=True, 41 | creationflags=subprocess.CREATE_NEW_PROCESS_GROUP 42 | ) 43 | 44 | # create s3 connection, bucket and s3 url's 45 | cls.s3_conn = boto3.resource( 46 | "s3", region_name="eu-central-1", 47 | endpoint_url=S3_MOCK_ENDPOINT 48 | ) 49 | bucket = "bucket" 50 | delete_s3_bucket(bucket) 51 | cls.s3_conn.create_bucket(Bucket=bucket) 52 | cls.s3_source = "s3://{}/{}".format(bucket, "source.csv") 53 | cls.s3_destination = "s3://{}/{}".format(bucket, "destination.csv") 54 | 55 | # Setup spark to use s3, and point it to the moto server. 56 | os.environ[ 57 | "PYSPARK_SUBMIT_ARGS" 58 | ] = """--packages "org.apache.hadoop:hadoop-aws:2.7.4" pyspark-shell""" 59 | cls.spark = SparkSession.builder.getOrCreate() 60 | hadoop_conf = cls.spark.sparkContext._jsc.hadoopConfiguration() 61 | hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 62 | hadoop_conf.set("fs.s3a.access.key", "mock") 63 | hadoop_conf.set("fs.s3a.secret.key", "mock") 64 | hadoop_conf.set("fs.s3a.endpoint", S3_MOCK_ENDPOINT) 65 | 66 | # create source dataframe and write the dataframe as csv to s3 67 | values = [("k1", 1), ("k2", 2)] 68 | columns = ["key", "value"] 69 | df = cls.spark.createDataFrame(values, columns) 70 | df.write.csv(cls.s3_source) 71 | 72 | @mock.patch("glue_job._commit_job") 73 | @mock.patch("glue_job._get_glue_args") 74 | @mock.patch("glue_job._get_spark_session_and_glue_job") 75 | def test_glue_job_runs_successfully(self, m_session_job, m_get_glue_args, m_commit): 76 | """ 77 | we arrange our test function; construct the arguments that we get from the cli, set the return 78 | values of our mocked functions. 79 | we run our glue job and assert if the result is what we expect. 
80 | """ 81 | # arrange 82 | cli_args = {"--JOBNAME": 'TestGlueLocal', "--source": self.s3_source, "--destination": self.s3_destination} 83 | 84 | m_session_job.return_value = self.spark, None 85 | m_get_glue_args.return_value = cli_args 86 | 87 | # act 88 | glue_job.run(cli_args=cli_args, spark=self.spark) 89 | 90 | # assert 91 | df = self.spark.read.csv(self.s3_destination) 92 | self.assertTrue(not df.rdd.isEmpty()) 93 | 94 | @classmethod 95 | def tearDownClass(cls): 96 | # shut down moto server 97 | os.killpg(os.getpgid(cls.process.pid), signal.SIGTERM) 98 | 99 | 100 | if __name__ == "__main__": 101 | try: 102 | unittest.main() 103 | except Exception: 104 | TestGlueJob().tearDownClass() 105 | -------------------------------------------------------------------------------- /tests/aws_test/test_mocked_postgres.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import sqlalchemy 4 | from testcontainers.postgres import PostgresContainer 5 | 6 | 7 | class MockedPostgresTest(unittest.TestCase): 8 | @classmethod 9 | def test_docker_run_postgress(cls) -> None: 10 | postgres_container = PostgresContainer("postgres:9.5") 11 | with postgres_container as postgres: 12 | e = sqlalchemy.create_engine(postgres.get_connection_url()) 13 | result = e.execute("SELECT version()") 14 | 15 | @classmethod 16 | def tearDownClass(cls) -> None: 17 | print('Done') 18 | 19 | 20 | if __name__ == '__main__': 21 | try: 22 | unittest.main() 23 | except Exception: 24 | MockedPostgresTest().tearDownClass() 25 | -------------------------------------------------------------------------------- /tests/aws_test/testing_mocked_s3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import subprocess 4 | import unittest 5 | 6 | import boto3 7 | from pyspark.sql import DataFrame 8 | from pyspark.sql import SparkSession 9 | 10 | 11 | class MockTestGlueJob(unittest.TestCase): 12 | # start moto server, by default it runs on localhost on port 5000. 13 | process = subprocess.Popen( 14 | ['moto_server', 's3'], 15 | stdout=subprocess.PIPE, 16 | shell=True, 17 | creationflags=subprocess.CREATE_NEW_PROCESS_GROUP 18 | ) 19 | 20 | @classmethod 21 | def setUpClass(cls) -> None: 22 | # create an s3 connection that points to the moto server. 23 | s3_conn = boto3.resource( 24 | "s3", endpoint_url="http://127.0.0.1:5000" 25 | ) 26 | # create an S3 bucket. 27 | s3_conn.create_bucket(Bucket="bucket") 28 | # # configure pyspark to use hadoop-aws module. os.environ[ "PYSPARK_SUBMIT_ARGS" ] = '--packages 29 | # "org.apache.hadoop:hadoop-aws:2.7.3" --packages "org.apache.httpcomponents:httpclient:4.2.5" ' \ 30 | # '--packages "org.xerial.snappy:snappy-java:1.1.7.3" pyspark-shell ' 31 | 32 | def test_s3_glue_jobs_locally(self): 33 | # get the spark session object and hadoop configuration. 34 | spark = SparkSession.builder.getOrCreate() 35 | hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration() 36 | # mock the aws credentials to access s3. 37 | hadoop_conf.set("fs.s3a.access.key", "dummy-value") 38 | hadoop_conf.set("fs.s3a.secret.key", "dummy-value") 39 | # we point s3a to our moto server. 40 | hadoop_conf.set("fs.s3a.endpoint", "http://127.0.0.1:5000") 41 | # we need to configure hadoop to use s3a. 42 | hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 43 | # create a pyspark dataframe. 
44 | values = [("k1", 1), ("k2", 2)] 45 | columns = ["key", "value"] 46 | df = spark.createDataFrame(values, columns) 47 | # write the dataframe as csv to s3. 48 | df.write.mode('overwrite').csv("s3://bucket/source.csv") 49 | # read the dataset from s3 50 | df = spark.read.csv("s3://bucket/source.csv") 51 | # print Data 52 | df.show() 53 | # assert df is a DataFrame 54 | assert isinstance(df, DataFrame) 55 | print("test_s3_glue_jobs_locally successfully completed") 56 | 57 | @classmethod 58 | def tearDownClass(cls) -> None: 59 | # shut down the moto server. 60 | os.kill(cls.process.pid, signal.SIGTERM) 61 | 62 | 63 | if __name__ == "__main__": 64 | try: 65 | unittest.main() 66 | except Exception: 67 | MockTestGlueJob().tearDownClass() 68 | -------------------------------------------------------------------------------- /tests/resources/config.yml: -------------------------------------------------------------------------------- 1 | # required to connect to redshift 2 | host: my.redshift.cluster.com 3 | port: 5439 4 | database: db 5 | user: userid 6 | password: password 7 | ## optional extras for the dbapi connector 8 | sslmode: require 9 | another_option: 123 -------------------------------------------------------------------------------- /tests/resources/datamodel.csv: -------------------------------------------------------------------------------- 1 | table ,pk ,fk_table,fk_col ,fk_table_jointype 2 | xmltable ,_id , , , 3 | carbank_xmltable,_secid ,xmltable,_id , 4 | species_xmltable,col ,xmltable,_id , 5 | red_xmltable ,MedlineID,xmltable,_id , 6 | product ,id ,purchase,productid,LEFT 7 | purchase ,id , , , 8 | store ,id ,purchase,storeid ,INNER 9 | -------------------------------------------------------------------------------- /tests/resources/meta.csv: -------------------------------------------------------------------------------- 1 | key ,src_system,source_desc,src_database,src_table ,src_filetype,src_file_path ,src_table_description,src_col ,src_col_description,src_col_datatype,key_constraints,src_col_filter,src_col_aggregator,src_col_aggregator_filter,check_column,mode,udf ,udfarguments,target_database,src_table_order,target_col ,target_col_filter,target_col_aggregator,target_col_aggregator_filter,target_table ,target_file_path,target_col_datatype,access_limitation,nullable,comment 2 | carbank_xmltable-_sec_id , , , ,carbank_xmltable,tbl , , ,_sec_id , ,string , , , , , , , , , ,0 ,sec_id , , , ,transformxmltable, ,string , , , 3 | xmltable-_id , , , ,xmltable ,xml , , ,_id , ,string , , , , , , , , , ,0 ,id , , , ,transformxmltable, ,string , , , 4 | xmltable-_mtype , , , ,xmltable ,xml , , ,_mtype , ,string , , , , , , , , , ,0 ,mtype , , , ,transformxmltable, ,string , , , 5 | xmltable-_seqlen , , , ,xmltable ,xml , , ,_seqlen , ,bigint , , , , , , , , , ,0 ,seqlen , , , ,transformxmltable, ,bigint , , , 6 | species_xmltable-col , , , ,species_xmltable,tbl , , ,col , ,string , , , , , , , , , ,0 ,species , , , ,transformxmltable, ,string , , , 7 | red_xmltable-MedlineID , , , ,red_xmltable ,tbl , , ,MedlineID , ,bigint , , , , , , , , , ,0 ,MedlineID , , , ,transformxmltable, ,bigint , , , 8 | product-name , , , ,product ,csv ,resources/product.csv , ,name , ,string , , , , , , ,nvl ,- , ,0 ,name , , , ,invoice , ,string , , , 9 | purchase-purchasedate , , , ,purchase ,csv ,resources/purchase.csv, ,purchasedate , ,string ,pk , , , , , ,nvl ,1/1/1900 , ,0 ,purchasedate , , , ,invoice , ,string , , , 10 | store-name , , , ,store ,csv ,resources/store.csv , ,name , ,string , 
,eq('Dadar') , , , , ,nvl ,- , ,0 ,storename , , , ,invoice , ,string , , , 11 | product-name , , , ,product ,csv ,resources/product.csv , ,name , ,string , , , , , , ,nvl ,- , ,1 ,name , , , ,salesummary , ,string , , , 12 | purchase-id , , , ,purchase ,csv ,resources/purchase.csv, ,id , ,string , , , , , , ,count, , ,1 ,totalsolditems , , , ,salesummary , ,string , , , 13 | store-name , , , ,store ,csv ,resources/store.csv , ,name , ,string , , , , , , ,nvl ,- , ,1 ,storename , , , ,salesummary , ,string , , , 14 | salesummary-storename , , , ,salesummary , , , ,storename , , , , , , , , , , , ,0 ,productname , , , ,salesummary , ,string , , , 15 | salesummary-name , , , ,salesummary , , , ,name , , , , , , , , , , , ,0 ,totalsoldproducts, , , ,salesummary , ,string , , , 16 | salesummary-totalsolditems, , , ,salesummary , , , ,totalsolditems, , , , , , , , , , , ,0 ,storename , , , ,salesummary , ,string , , , 17 | -------------------------------------------------------------------------------- /tests/resources/mock_dataframe.txt: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,x,2011-01-01 3 | 2,y,2001-04-02 -------------------------------------------------------------------------------- /tests/resources/product.csv: -------------------------------------------------------------------------------- 1 | id,name ,price 2 | 1 ,Wrist Watch,10 3 | 2 ,Shoes ,8 4 | 3 ,Tshirt ,5 5 | 4 ,Jeans ,7 6 | 5 ,Sunglasses ,7 7 | -------------------------------------------------------------------------------- /tests/resources/purchase.csv: -------------------------------------------------------------------------------- 1 | id ,productid,purchasedate,storeid 2 | 100,1 ,10/11/2019 ,1000 3 | 101,3 ,10/12/2019 ,1002 4 | 102,1 , ,1004 5 | 103,1 ,10/14/2019 ,1004 6 | 104,4 ,10/15/2019 ,1003 7 | 105,4 ,10/16/2019 ,1002 8 | -------------------------------------------------------------------------------- /tests/resources/store.csv: -------------------------------------------------------------------------------- 1 | id ,name 2 | 1000,Borivili 3 | 1001,Kandivili 4 | 1002,Andheri 5 | 1003,Bandra 6 | 1004,Dadar 7 | 1005,Byculla 8 | -------------------------------------------------------------------------------- /tests/test_comprehensive_logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import unittest 3 | import logging.config 4 | 5 | from com.hellofresh.utils.comprehensive_logging import init_logging 6 | 7 | 8 | class LoggingTestCases(unittest.TestCase): 9 | def test_init_logging(self): 10 | init_logging(job_name='Unit tests') 11 | logger = logging.getLogger('root') 12 | self.assertEqual(logger.level, 10) 13 | 14 | 15 | if __name__ == '__main__': 16 | unittest.main() 17 | -------------------------------------------------------------------------------- /tests/test_data_quality.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import unittest 3 | from pathlib import Path 4 | 5 | from com.hellofresh.utils.data_quality import Rule, RuleExecutionResult, DataQuality 6 | from com.hellofresh.utils.helpers import get_project_root, read_json_get_dict 7 | from com.hellofresh.utils.spark import get_or_create_spark_session 8 | 9 | 10 | class DataQualityTestCases(unittest.TestCase): 11 | def test_Rule(self): 12 | rule_dict = { 13 | "rule_id": 1011, 14 | "name": "Primary / Natural Keys", 15 | "description": "Primary / Natural Keys should not have duplicates", 16 | 
"rule_type": "unique", 17 | "columns": [ 18 | "name" 19 | ] 20 | } 21 | rule = Rule(**rule_dict) 22 | self.assertEqual(rule.rule_id, 1011) 23 | self.assertEqual(rule.name, "Primary / Natural Keys") 24 | 25 | def test_RuleExecutionResult(self): 26 | rule_dict = { 27 | "rule_id": 1011, 28 | "name": "Primary / Natural Keys", 29 | "description": "Primary / Natural Keys should not have duplicates", 30 | "rule_type": "unique", 31 | "columns": [ 32 | "name" 33 | ] 34 | } 35 | rule = Rule(**rule_dict) 36 | result = RuleExecutionResult(rule, 'fail', 0, 0, 0) 37 | self.assertEqual(result.status, 'fail') 38 | self.assertEqual(result.rule, rule) 39 | self.assertEqual(result.rule.rule_type, 'unique') 40 | 41 | def test_data_quality(self): 42 | shutil.rmtree(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks") 43 | t1_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task1-dq-rules.json" 44 | dq = read_json_get_dict(json_path=t1_dq) 45 | dq['execution_reports_dir'] = f"{get_project_root()}/resources/data-quality-reports/recipe-tasks" 46 | dq_rules = DataQuality(**dq) 47 | spark = get_or_create_spark_session() 48 | df = spark.read.option('encoding', 'utf-8').json(f"{get_project_root()}/resources/data/input") 49 | execution_result = dq_rules.execute_rules(df=df) 50 | self.assertEqual(execution_result[0], False) 51 | self.assertTrue(execution_result[1].__contains__('')) 52 | dq_rules.write_report_to_html(file_name="task1-dq-report.html") 53 | self.assertTrue( 54 | Path(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks/task1-dq-report.html").is_file()) 55 | 56 | 57 | if __name__ == '__main__': 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import getpass 3 | import unittest 4 | from pathlib import Path 5 | 6 | from isodate import ISO8601Error 7 | 8 | from com.hellofresh.datapipelines.recipe_tasks import determine_cooking_difficulty 9 | from com.hellofresh.utils.comprehensive_logging import init_logging 10 | from com.hellofresh.utils.helpers import get_user, get_project_root, convert_iso_to_time_duration, \ 11 | add_iso_time_duration 12 | 13 | 14 | class UtilsHelpersTestCases(unittest.TestCase): 15 | init_logging(job_name='UtilsHelpersTestCases') 16 | 17 | def test_get_user(self): 18 | user = get_user() 19 | self.assertEqual(user, getpass.getuser()) 20 | 21 | def test_get_project_root(self): 22 | project_root_path: Path = get_project_root() 23 | self.assertEqual(project_root_path.name, 'vim89-data-engineering-test') 24 | 25 | def test_convert_iso_to_time_duration(self): 26 | try: 27 | convert_iso_to_time_duration("") 28 | except ValueError as v: 29 | self.assertEqual(v.__str__(), 'Empty or Invalid time duration string') 30 | 31 | try: 32 | convert_iso_to_time_duration("ABC") 33 | except ISO8601Error as i: 34 | self.assertEqual(i.__str__(), 'Error converting ISO time ABC to timedelta') 35 | 36 | iso_time = convert_iso_to_time_duration("PT100M") 37 | self.assertEqual(iso_time, datetime.timedelta(hours=1, minutes=40)) 38 | 39 | iso_time = convert_iso_to_time_duration("PT") 40 | self.assertEqual(iso_time, datetime.timedelta(0)) 41 | 42 | def test_add_iso_time_duration(self): 43 | try: 44 | add_iso_time_duration(time1="", time2="PT1H") 45 | except ValueError as v: 46 | self.assertEqual(v.__str__(), 'Empty or Invalid time duration string') 47 | 48 | iso_time = 
add_iso_time_duration(time1="PT100M", time2="PT1H") 49 | self.assertEqual(iso_time, "PT2H40M") 50 | 51 | iso_time = add_iso_time_duration(time1="PT", time2="PT5M") 52 | self.assertEqual(iso_time, "PT5M") 53 | 54 | iso_time = add_iso_time_duration(time1="PT", time2="PT") 55 | self.assertEqual(iso_time, "P0D") 56 | 57 | def test_determine_difficulty(self): 58 | difficulty = determine_cooking_difficulty(cook_time="PT", prep_time="PT") 59 | self.assertEqual(difficulty, ('P0D', 'easy')) 60 | 61 | difficulty = determine_cooking_difficulty(cook_time="PT21H", prep_time="PT") 62 | self.assertEqual(difficulty, ('PT21H', 'hard')) 63 | 64 | difficulty = determine_cooking_difficulty(cook_time="PT", prep_time="PT100M") 65 | self.assertEqual(difficulty, ('PT1H40M', 'hard')) 66 | 67 | 68 | if __name__ == '__main__': 69 | unittest.main() 70 | -------------------------------------------------------------------------------- /tests/test_logging_util.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import unittest 3 | import logging.config 4 | 5 | from utils.Utilities import init_logging 6 | from utils.audit_util import audit_action 7 | 8 | 9 | class TestLoggingUtil(unittest.TestCase): 10 | def test_init_logging(self): 11 | init_logging(log_time_stamp=datetime.datetime.now()) 12 | level20 = logging.getLogger('simpleExample').level 13 | self.assertEqual(level20, 20) 14 | 15 | def test_audit_action(self): 16 | @audit_action(action=f"testing Audit Action Wrapper") 17 | def audit_decorator(): 18 | pass 19 | 20 | 21 | if __name__ == '__main__': 22 | unittest.main() 23 | -------------------------------------------------------------------------------- /tests/test_recipe_tasks.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import unittest 3 | from pathlib import Path 4 | 5 | from com.hellofresh.datapipelines.recipe_tasks import main, task1, task2, determine_cooking_difficulty, \ 6 | calculate_time_duration_average, standardize_and_rename_df_columns 7 | from com.hellofresh.utils.data_quality import DataQuality 8 | from com.hellofresh.utils.helpers import get_project_root, read_json_get_dict, convert_iso_to_time_duration 9 | from com.hellofresh.utils.spark import get_or_create_spark_session 10 | 11 | 12 | def del_dirs(): 13 | try: 14 | shutil.rmtree(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks") 15 | shutil.rmtree(f"{get_project_root()}/resources/data/output/task1") 16 | shutil.rmtree(f"{get_project_root()}/resources/data/output/task2") 17 | except: 18 | pass 19 | 20 | 21 | class RecipeTasksTestCases(unittest.TestCase): 22 | 23 | @classmethod 24 | def setUpClass(self): 25 | self.args = { 26 | 'input_data_dir': f"{get_project_root()}/resources/data/input", 27 | 'output_data_dir': f"{get_project_root()}/resources/data/output" 28 | } 29 | del_dirs() 30 | 31 | @unittest.skip 32 | def test_main(self): 33 | del_dirs() 34 | t1_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task1-dq-rules.json" 35 | t2_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task2-dq-rules.json" 36 | main(self.args, t1_dq, t2_dq) 37 | self.assertTrue( 38 | Path(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks/task1-dq-report.html").is_file()) 39 | 40 | def test_task1(self): 41 | del_dirs() 42 | t1_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task1-dq-rules.json" 43 | dq = read_json_get_dict(json_path=t1_dq) 
44 | dq['execution_reports_dir'] = f"{get_project_root()}/resources/data-quality-reports/recipe-tasks" 45 | dq_rules = DataQuality(**dq) 46 | task1(input_data_path=self.args['input_data_dir'], input_file_type='json', dq_rules=dq_rules, 47 | output_data_path=f"{self.args['output_data_dir']}/task1", spark_opts={'encoding': 'utf-8'}) 48 | 49 | self.spark = get_or_create_spark_session() 50 | df = self.spark.read.parquet(f"{self.args['output_data_dir']}/task1") 51 | self.assertEqual(df.count(), 1042) 52 | self.assertTrue(df.columns.__contains__('cook_time')) 53 | self.assertTrue( 54 | Path(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks/task1-dq-report.html").is_file()) 55 | 56 | def test_task2(self): 57 | t2_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task2-dq-rules.json" 58 | dq = read_json_get_dict(json_path=t2_dq) 59 | dq['execution_reports_dir'] = f"{get_project_root()}/resources/data-quality-reports/recipe-tasks" 60 | dq_rules = DataQuality(**dq) 61 | task2(input_data_path=f"{self.args['output_data_dir']}/task1", input_file_type='parquet', dq_rules=dq_rules, 62 | output_data_path=f"{self.args['output_data_dir']}/task2") 63 | 64 | self.spark = get_or_create_spark_session() 65 | df = self.spark.read.csv(f"{self.args['output_data_dir']}/task2", header=True) 66 | self.assertEqual(df.count(), 3) 67 | self.assertTrue(df.columns.__contains__('avg_total_cooking_time')) 68 | self.assertTrue( 69 | Path(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks/task2-dq-report.html").is_file()) 70 | 71 | def test_determine_cooking_difficulty(self): 72 | difficulty = determine_cooking_difficulty("PT1H", "PT2M") 73 | self.assertEqual(difficulty, ('PT1H2M', 'hard')) 74 | difficulty = determine_cooking_difficulty("PT5M", "PT15M") 75 | self.assertEqual(difficulty, ('PT20M', 'easy')) 76 | difficulty = determine_cooking_difficulty("PT15M", "PT20M") 77 | self.assertEqual(difficulty, ('PT35M', 'medium')) 78 | difficulty = determine_cooking_difficulty("PT", "PT") 79 | self.assertEqual(difficulty, ('P0D', 'easy')) 80 | 81 | try: 82 | difficulty = determine_cooking_difficulty("", "PT1H") 83 | print(difficulty) 84 | except Exception as ex: 85 | self.assertEqual(ex.__str__(), 'Expecting a string None') 86 | 87 | def test_calculate_time_duration_average(self): 88 | list_of_time_duration = list(map(lambda t: convert_iso_to_time_duration(t), ["PT1H", "PT30M", "PT", "PT2H5M"])) 89 | avg = calculate_time_duration_average(list_of_time_duration) 90 | self.assertEqual(avg, 'PT53M45S') 91 | 92 | 93 | if __name__ == '__main__': 94 | unittest.main() 95 | -------------------------------------------------------------------------------- /tests/test_spark.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | from com.hellofresh.utils.comprehensive_logging import init_logging 6 | from com.hellofresh.utils.spark import get_or_create_spark_session, standardize_and_rename_df_columns, \ 7 | read_data_as_spark_dataframe, data_frame_repartition 8 | 9 | 10 | class UtilsSparkTestCases(unittest.TestCase): 11 | init_logging(job_name='UtilsSparkTestCases') 12 | 13 | def test_create_spark_session(self): 14 | spark: SparkSession = get_or_create_spark_session() 15 | self.assertIsNot(spark, None) 16 | self.assertEqual(spark, SparkSession.getActiveSession()) 17 | self.assertEqual(spark.sparkContext.appName.__str__(), 'pyspark-shell') 18 | 19 | def 
test_standardize_and_rename_df_columns(self): 20 | spark = get_or_create_spark_session() 21 | data = [('Category A', 100, "This is category A"), 22 | ('Category B', 120, "This is category B"), 23 | ('Category C', 150, "This is category C")] 24 | df = spark.sparkContext.parallelize(data).toDF(['cateGory ', ' iD ', 'category description']) 25 | 26 | self.assertEqual(df.columns, ['cateGory ', ' iD ', 'category description']) 27 | 28 | df = standardize_and_rename_df_columns(df=df, 29 | column_names_to_rename={'category description': 'category_description'}) 30 | self.assertEqual(df.columns, ['category', 'id', 'category_description']) 31 | 32 | def test_negative_cases_for_read_data_as_spark_dataframe(self): 33 | 34 | # INVALID 35 | try: 36 | df = read_data_as_spark_dataframe(filetype='invalid', location='a://a.txt') 37 | except Exception as ex: 38 | print(ex.__str__()) 39 | self.assertEqual(ex.__str__(), 'Invalid filetype: invalid') 40 | 41 | # CSV 42 | try: 43 | df = read_data_as_spark_dataframe(filetype='csv', location='a://a.csv') 44 | csv_read = 'successful' 45 | except Exception as ex: 46 | csv_read = 'failed' 47 | 48 | self.assertAlmostEqual(csv_read, 'failed') 49 | 50 | # TEXT 51 | try: 52 | df = read_data_as_spark_dataframe(filetype='text', location='a://a.txt') 53 | text_read = 'successful' 54 | except Exception as ex: 55 | text_read = 'failed' 56 | 57 | self.assertAlmostEqual(text_read, 'failed') 58 | 59 | # XML 60 | try: 61 | df = read_data_as_spark_dataframe(filetype='xml', location='a://a.xml') 62 | xml_read = 'successful' 63 | except Exception as ex: 64 | xml_read = 'failed' 65 | 66 | self.assertAlmostEqual(xml_read, 'failed') 67 | 68 | # Table 69 | try: 70 | df = read_data_as_spark_dataframe(filetype='table', location='a://a.xml') 71 | table_read = 'successful' 72 | except Exception as ex: 73 | table_read = 'failed' 74 | 75 | self.assertAlmostEqual(table_read, 'failed') 76 | 77 | def test_data_frame_repartition(self): 78 | spark = get_or_create_spark_session() 79 | data = [('Category A', 100, "This is category A"), 80 | ('Category B', 120, "This is category B"), 81 | ('Category C', 150, "This is category C")] 82 | df = spark.sparkContext.parallelize(data).toDF(['category', 'id', 'category_description']) 83 | 84 | df = data_frame_repartition(df=df, use_coalesce=True, num_files=1) 85 | self.assertTrue(df is not None) 86 | 87 | df = data_frame_repartition(df=df, num_files=5, repartition_columns=['category']) 88 | self.assertFalse(df.columns.__contains__('temp_repartition_column')) 89 | 90 | 91 | if __name__ == '__main__': 92 | unittest.main() 93 | -------------------------------------------------------------------------------- /tests/test_spark_submit_execution_pool.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datetime import datetime 3 | 4 | from utils.Utilities import init_logging, create_multiprocess_pool, execute_bash 5 | 6 | 7 | class TestSparkSubmitExecutionPool(unittest.TestCase): 8 | init_logging(datetime.now()) 9 | 10 | def test_create_multiprocess_pool(self): 11 | bash_commands = [ 12 | 'echo "cmd1"', 13 | 'echo "cmd2"', 14 | 'echo "cmd3"', 15 | 'hadoop version', 16 | 'echo "cmd5"', 17 | 'echo "cmd6"', 18 | 'echo "cmd7"', 19 | 'echo "cmd8"', 20 | 'echo "cmd9"', 21 | 'echo "cmd10"', 22 | 'echo "cmd11"' 23 | ] 24 | results, failures = create_multiprocess_pool( 25 | shared_data={'log_timestamp': datetime.now().isoformat().__str__()}, 26 | command_list=bash_commands, 27 | sleep_time=0, 28 | 
max_parallel_jobs=6 29 | ) 30 | 31 | bash_commands.append('spark-submit') 32 | _results, _failures = create_multiprocess_pool( 33 | shared_data={'log_timestamp': datetime.now().isoformat().__str__()}, 34 | command_list=bash_commands, 35 | sleep_time=0, 36 | max_parallel_jobs=6 37 | ) 38 | 39 | self.assertEqual(len(failures), 0) 40 | self.assertEqual(len(_failures) > 0, True) 41 | 42 | def test_execute_bash(self): 43 | pid, return_code, yarn_application_id, stdout, stderr = \ 44 | execute_bash(shared_data={'log_timestamp': datetime.now().isoformat().__str__()}, 45 | sleep_time=0, cmd='hadoop version') 46 | 47 | _pid, _return_code, _yarn_application_id, _stdout, _stderr = \ 48 | execute_bash(shared_data={'log_timestamp': datetime.now().isoformat().__str__()}, 49 | sleep_time=0, cmd='spark-submit') 50 | self.assertNotEqual(pid, None) 51 | self.assertNotEqual(stderr, None) 52 | self.assertNotEqual(stdout, None) 53 | 54 | self.assertEqual(len(yarn_application_id), 0) 55 | self.assertEqual(return_code == 0, True) 56 | 57 | self.assertNotEqual(_pid, None) 58 | self.assertNotEqual(_stderr, None) 59 | self.assertNotEqual(_stdout, None) 60 | 61 | self.assertEqual(len(_yarn_application_id), 0) 62 | self.assertEqual(_return_code > 0, True) 63 | 64 | 65 | if __name__ == '__main__': 66 | unittest.main() 67 | -------------------------------------------------------------------------------- /tests/test_spark_submit_utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import unittest 4 | 5 | from utils.Utilities import init_logging, cast_string_to_date, get_project_root, read_json_get_dict, read_yaml_get_dict 6 | from utils.spark_submit_utils import prepare_spark_submit 7 | 8 | 9 | class TestSparkSubmitUtils(unittest.TestCase): 10 | init_logging(datetime.datetime.now()) 11 | 12 | def test_get_project_root(self): 13 | self.assertEqual(get_project_root().__str__(), '/Users/v0m02sj/PycharmProjects/datapipelines-essentials') 14 | 15 | def test_cast_string_to_date(self): 16 | dt = cast_string_to_date('2020-01-01', '%Y-%m-%d') 17 | _dt = cast_string_to_date('abcdefg', '%Y-%m-%d') 18 | self.assertEqual(type(dt), datetime.datetime) 19 | self.assertEqual(_dt, None) 20 | 21 | def test_prepare_spark_submit_command(self): 22 | application_properties = read_json_get_dict( 23 | json_path=f"{get_project_root()}/main/src/resources/config/application_properties.json") 24 | runtime_args = {} # parse_arguments(application_properties.get('command_line_args')) 25 | runtime_args.update({ 26 | "workflow": "DVSkuDailyChannelWorkFlow", 27 | "refreshType": "history", 28 | "startDate": "2020-01-01", 29 | "endDate": "2020-01-10", 30 | "dq_enabled": "Y", 31 | "configFile": "/Users/v0m02sj/IdeaProjects/channel-perf-data-pipeline/configs/config-prod.yml" 32 | }) 33 | static_args = read_yaml_get_dict(runtime_args['configFile']) 34 | runtime_args.update({'configFile': runtime_args['configFile'].split('/')[-1]}) 35 | commands = prepare_spark_submit(runtime_args=runtime_args, config_args=static_args, 36 | app_config=application_properties) 37 | logging.debug(commands) 38 | self.assertEqual(len(commands) > 0, True) 39 | 40 | 41 | if __name__ == '__main__': 42 | unittest.main() 43 | --------------------------------------------------------------------------------
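
The S3 tests above (AwsS3Test, MockTestGlueJob and TestGlueJob) all bootstrap a mock S3 endpoint by spawning moto_server through subprocess.Popen with a Windows-only creation flag and then killing the process on teardown. A minimal sketch of an in-process alternative follows; it assumes a recent moto release that provides moto.server.ThreadedMotoServer and a local Spark installation with the hadoop-aws (s3a) connector on its classpath, neither of which is pinned by this repository, and the class name PortableMockedS3Test is illustrative rather than part of the codebase.

import unittest

import boto3
from moto.server import ThreadedMotoServer
from pyspark.sql import SparkSession


class PortableMockedS3Test(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        # Start the mock S3 endpoint inside the test process; no subprocess or OS-specific flags needed.
        cls.server = ThreadedMotoServer(port=5000)
        cls.server.start()
        cls.endpoint = "http://127.0.0.1:5000"

        # Create the bucket against the mocked endpoint.
        boto3.resource(
            "s3",
            endpoint_url=cls.endpoint,
            aws_access_key_id="dummy-value",
            aws_secret_access_key="dummy-value",
            region_name="us-east-1",
        ).create_bucket(Bucket="bucket")

        # Point s3a at the mocked endpoint, mirroring the hadoopConfiguration used in AwsS3Test.
        cls.spark = SparkSession.builder.getOrCreate()
        hadoop_conf = cls.spark.sparkContext._jsc.hadoopConfiguration()
        hadoop_conf.set("fs.s3a.access.key", "dummy-value")
        hadoop_conf.set("fs.s3a.secret.key", "dummy-value")
        hadoop_conf.set("fs.s3a.endpoint", cls.endpoint)
        hadoop_conf.set("fs.s3a.path.style.access", "true")  # usually required for a localhost endpoint
        hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

    def test_round_trip(self):
        # Write a small dataframe to the mocked bucket and read it back.
        df = self.spark.createDataFrame([("k1", 1), ("k2", 2)], ["key", "value"])
        df.write.mode("overwrite").csv("s3://bucket/source.csv")
        self.assertEqual(self.spark.read.csv("s3://bucket/source.csv").count(), 2)

    @classmethod
    def tearDownClass(cls) -> None:
        cls.spark.stop()
        cls.server.stop()  # no signals or process groups involved


if __name__ == "__main__":
    unittest.main()

The hadoopConfiguration calls mirror the ones already used in AwsS3Test, so the same s3a settings are exercised whichever way the mock server is started.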