├── .coveragerc
├── .gitignore
├── LICENSE
├── MANIFEST.in
├── README.md
├── conf
├── data-quality
│ ├── example-dq-report.html
│ └── rules
│ │ ├── production_configs
│ │ ├── recipe-task1-dq-rules.json
│ │ └── recipe-task2-dq-rules.json
│ │ └── unit_test_configs
│ │ ├── recipe-task1-dq-rules.json
│ │ └── recipe-task2-dq-rules.json
├── python
│ └── logging-properties.json
└── spark
│ ├── log4j.properties
│ └── sparkConf.conf
├── docs
├── APIDOC.MD
├── ETL_README.md
├── PysparkLocalSetup.docx
├── SETUP.MD
├── apidocumentation.html
├── images
│ ├── DataQualityUML.png
│ ├── XMLParse.png
│ ├── dq-task1.png
│ ├── dq-task2.png
│ ├── task1_ouput_er.png
│ └── task2_ouput_er.png
└── setup.html
├── logs
├── bash
│ └── logs
└── python
│ └── log-sample
├── requirements.txt
├── resources
├── data-quality-reports
│ └── recipe-tasks
│ │ ├── task1-dq-report.html
│ │ └── task2-dq-report.html
└── data
│ ├── clinical_trial
│ ├── data
│ │ └── chunk1.zip
│ ├── job_parameters
│ │ └── clinical_trial.json
│ ├── sql
│ │ └── transformations
│ │ │ └── sponsors.sql
│ └── xml
│ │ ├── clinical_study_xsd.xsd
│ │ └── default_clinical_study.xml
│ ├── config
│ ├── application_properties.json
│ ├── application_properties.yaml
│ └── logging.yaml
│ ├── product.csv
│ ├── purchase.csv
│ ├── recipes
│ ├── input
│ │ ├── recipes-000.json
│ │ ├── recipes-001.json
│ │ └── recipes-002.json
│ └── output
│ │ ├── task1
│ │ ├── ._SUCCESS.crc
│ │ ├── .part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet.crc
│ │ ├── _SUCCESS
│ │ └── part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet
│ │ └── task2
│ │ ├── ._SUCCESS.crc
│ │ ├── .part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv.crc
│ │ ├── _SUCCESS
│ │ └── part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv
│ └── store.csv
├── sbin
├── common_functions.sh
├── create_python_venv.sh
└── execute-tasks-spark-submit.sh
├── setup.py
├── src
└── com
│ ├── __init__.py
│ └── vitthalmirji
│ ├── __init__.py
│ ├── datapipelines
│ ├── __init__.py
│ ├── clinical_trial
│ │ ├── __init__.py
│ │ └── clinical_trial_etl.py
│ └── recipe_tasks.py
│ ├── datawarehousing
│ ├── __init__.py
│ └── change_data_capture.py
│ ├── etl
│ ├── CColumn.py
│ ├── ETL.py
│ ├── ETLTransform.py
│ ├── ITable.py
│ ├── __init__.py
│ └── meta
│ │ ├── MetaModel.py
│ │ └── __init__.py
│ ├── imports
│ ├── HdfsImport.py
│ └── __init__.py
│ ├── kafka
│ ├── Logger.py
│ └── __init__.py
│ ├── main.py
│ ├── mapper
│ ├── Mapper.py
│ └── __init__.py
│ ├── objects
│ ├── __init__.py
│ └── enums
│ │ ├── Environments.py
│ │ ├── Zones.py
│ │ └── __init__.py
│ └── utils
│ ├── MockupData.py
│ ├── Utilities.py
│ ├── __init__.py
│ ├── audit_util.py
│ ├── comprehensive_logging.py
│ ├── constants.py
│ ├── data_quality.py
│ ├── helpers.py
│ ├── logging_util.py
│ ├── spark.py
│ ├── spark_submit_utils.py
│ └── transformation_extension.py
└── tests
├── EtlTransformTest.py
├── UtilsTest.py
├── XmlMapperTest.py
├── aws_test
├── AwsS3Test.py
├── __init__.py
├── glue_job.py
├── test_glue_job.py
├── test_mocked_postgres.py
├── test_mocked_redshift.py
├── test_mocked_redshift_infra.py
└── testing_mocked_s3.py
├── resources
├── config.yml
├── datamodel.csv
├── meta.csv
├── mock_dataframe.txt
├── product.csv
├── purchase.csv
└── store.csv
├── test_comprehensive_logging.py
├── test_data_quality.py
├── test_helpers.py
├── test_logging_util.py
├── test_recipe_tasks.py
├── test_spark.py
├── test_spark_submit_execution_pool.py
└── test_spark_submit_utils.py
/.coveragerc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/.coveragerc
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | ### Python template
2 |
3 | # Bash script logs
4 | *.log
5 |
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 |
11 | # Distribution / packaging
12 | .Python
13 | build/
14 | develop-eggs/
15 | dist/
16 | downloads/
17 | eggs/
18 | .eggs/
19 | lib/
20 | lib64/
21 | parts/
22 | sdist/
23 | var/
24 | wheels/
25 | pip-wheel-metadata/
26 | share/python-wheels/
27 | *.egg-info/
28 | .installed.cfg
29 | *.egg
30 | MANIFEST
31 | src/test/metastore_db
32 | src/src/main/test/hive
33 | src/test/spark-warehouse
34 | src/test/derby.log
35 |
36 | # PyInstaller
37 | # Usually these files are written by a python script from a template
38 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
39 | *.manifest
40 | *.spec
41 |
42 | # Installer logs
43 | pip-log.txt
44 | pip-delete-this-directory.txt
45 |
46 | # Unit test / coverage reports
47 | htmlcov/
48 | .tox/
49 | .nox/
50 | .coverage
51 | .coverage.*
52 | .cache
53 | nosetests.xml
54 | coverage.xml
55 | *.cover
56 | .hypothesis/
57 | .pytest_cache/
58 |
59 | # Translations
60 | *.mo
61 | *.pot
62 |
63 | # PyBuilder
64 | target/
65 |
66 | # pyenv
67 | .python-version
68 |
69 | # Environments
70 | .env
71 | .venv
72 | env/
73 | venv/
74 | ENV/
75 | env.bak/
76 | venv.bak/
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include sbin *.sh
2 | recursive-include conf *.html *.json *.conf *.properties *.html
3 | recursive-include resources *.html
4 | recursive-include logs logs log-sample
5 | recursive-include docs *.md
6 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Datalake ETL Pipeline
2 | Data transformation simplified for any Data platform.
3 |
4 | `Features:` The package has complete ETL process -
5 | 1. Uses metadata, transformation & data model information to design ETL pipeline
6 | 2. Builds target transformation SparkSQL and Spark Dataframes
7 | 3. Builds source & target Hive DDLs
8 | 4. Validates DataFrames, extends core classes, defines DataFrame transformations, and provides UDF SQL functions.
9 | 5. Supports below fundamental transformations for ETL pipeline -
10 | * Filters on source & target dataframes
11 | * Grouping and Aggregations on source & target dataframes
12 | * Heavily nested queries / dataframes
13 | 6. Has complex and heavily nested XML, JSON, Parquet & ORC parser to nth
14 | level of nesting
15 | 7. Has Unit test cases designed on function/method level & measures
16 | source code coverage
17 | 8. Has information about delpoying to higher environments
18 | 9. Has API documentation for customization & enhancement
19 |
20 | `Enhancements:` In progress -
21 | 1. Integrate Audit and logging - Define Error codes, log process
22 | failures, Audit progress & runtime information
--------------------------------------------------------------------------------
/conf/data-quality/example-dq-report.html:
--------------------------------------------------------------------------------
1 |
Team, Data Quality check finished successfully for DQ ID = 101 , with failures. Check details in below table of metrics.
Failed DQ details Yarn Application Id DQ ID Rule ID Rule Name Rule type Description Columns/Query Pass Count Fail Count Total Count local-1681916910001 101 1011 Primary / Natural Keys unique Primary / Natural Keys should not have duplicates ['name'] 1039 3 1042 local-1681916910001 101 1012 NOT NULL fields not null Field should have valid value ['name', 'cookTime', 'prepTime'] 715 327 1042
Succeeded DQ details Yarn Application Id DQ ID Rule ID Rule Name Rule type Description Columns/Query Pass Count Fail Count Total Count local-1681916910001 101 1013 File names check query Check If all input files are read for processing ["WITH file_names AS (SELECT 'recipes-000.json' AS file_name UNION SELECT 'recipes-001.json' AS file_name UNION SELECT 'recipes-002.json' AS file_name)\nSELECT f.file_name FROM file_names f\nLEFT JOIN (SELECT DISTINCT reverse(split(input_file_name(), '/'))[0] as file_name FROM temp) t\nON t.file_name = f.file_name\nWHERE t.file_name IS NULL"] 1042 0 1042
Thanks
--------------------------------------------------------------------------------
/conf/data-quality/rules/production_configs/recipe-task1-dq-rules.json:
--------------------------------------------------------------------------------
1 | {
2 | "dq_id": 101,
3 | "rules": [
4 | {
5 | "rule_id": 1011,
6 | "name": "Primary / Natural Keys",
7 | "description": "Primary / Natural Keys should not have duplicates",
8 | "rule_type": "unique",
9 | "columns": [
10 | "name"
11 | ]
12 | },
13 | {
14 | "rule_id": 1012,
15 | "name": "NOT NULL fields",
16 | "description": "Field should have valid value",
17 | "rule_type": "not null",
18 | "columns": [
19 | "name",
20 | "cookTime",
21 | "prepTime"
22 | ]
23 | },
24 | {
25 | "rule_id": 1013,
26 | "name": "Input files check",
27 | "description": "Check If all input files are read for processing",
28 | "rule_type": "query",
29 | "query": "WITH file_names AS (SELECT 'recipes-000.json' AS file_name UNION SELECT 'recipes-001.json' AS file_name UNION SELECT 'recipes-002.json' AS file_name)\nSELECT f.file_name FROM file_names f\nLEFT JOIN (SELECT DISTINCT reverse(split(input_file_name(), '/'))[0] as file_name FROM temp) t\nON t.file_name = f.file_name\nWHERE t.file_name IS NULL"
30 | },
31 | {
32 | "rule_id": 1014,
33 | "name": "\"Check for invalid cook & prep time",
34 | "description": "Check empty or null values",
35 | "rule_type": "query",
36 | "query": "SELECT * FROM temp WHERE cookTime = '' OR prepTime = ''"
37 | }
38 | ]
39 | }
40 |
--------------------------------------------------------------------------------
/conf/data-quality/rules/production_configs/recipe-task2-dq-rules.json:
--------------------------------------------------------------------------------
1 | {
2 | "dq_id": 101,
3 | "rules": [
4 | {
5 | "rule_id": 1015,
6 | "name": "Primary / Natural Keys",
7 | "description": "Primary / Natural Keys should not have duplicates",
8 | "rule_type": "unique",
9 | "columns": [
10 | "difficulty"
11 | ]
12 | },
13 | {
14 | "rule_id": 1016,
15 | "name": "NOT NULL fields",
16 | "description": "Field should have valid value",
17 | "rule_type": "not null",
18 | "columns": [
19 | "difficulty",
20 | "avg_total_cooking_time"
21 | ]
22 | }
23 | ]
24 | }
25 |
--------------------------------------------------------------------------------
/conf/data-quality/rules/unit_test_configs/recipe-task1-dq-rules.json:
--------------------------------------------------------------------------------
1 | {
2 | "dq_id": 101,
3 | "execution_reports_dir": "/resources/data-quality-reports/recipe-tasks",
4 | "email_execution_report_to": "vitthalmirji@gmail.com",
5 | "rules": [
6 | {
7 | "rule_id": 1011,
8 | "name": "Primary / Natural Keys",
9 | "description": "Primary / Natural Keys should not have duplicates",
10 | "rule_type": "unique",
11 | "columns": [
12 | "name"
13 | ]
14 | },
15 | {
16 | "rule_id": 1012,
17 | "name": "NOT NULL fields",
18 | "description": "Field should have valid value",
19 | "rule_type": "not null",
20 | "columns": [
21 | "name",
22 | "cookTime",
23 | "prepTime"
24 | ]
25 | },
26 | {
27 | "rule_id": 1013,
28 | "name": "Input files check",
29 | "description": "Check If all input files are read for processing",
30 | "rule_type": "query",
31 | "query": "WITH file_names AS (SELECT 'recipes-000.json' AS file_name UNION SELECT 'recipes-001.json' AS file_name UNION SELECT 'recipes-002.json' AS file_name)\nSELECT f.file_name FROM file_names f\nLEFT JOIN (SELECT DISTINCT reverse(split(input_file_name(), '/'))[0] as file_name FROM temp) t\nON t.file_name = f.file_name\nWHERE t.file_name IS NULL"
32 | },
33 | {
34 | "rule_id": 1014,
35 | "name": "\"Check for invalid cook & prep time",
36 | "description": "Check empty or null values",
37 | "rule_type": "query",
38 | "query": "SELECT * FROM temp WHERE cookTime = '' OR prepTime = ''"
39 | }
40 | ]
41 | }
42 |
--------------------------------------------------------------------------------
/conf/data-quality/rules/unit_test_configs/recipe-task2-dq-rules.json:
--------------------------------------------------------------------------------
1 | {
2 | "dq_id": 101,
3 | "execution_reports_dir": "/resources/data-quality-reports/recipe-tasks",
4 | "email_execution_report_to": "vitthalmirji@gmail.com",
5 | "rules": [
6 | {
7 | "rule_id": 1015,
8 | "name": "Primary / Natural Keys",
9 | "description": "Primary / Natural Keys should not have duplicates",
10 | "rule_type": "unique",
11 | "columns": [
12 | "difficulty"
13 | ]
14 | },
15 | {
16 | "rule_id": 1016,
17 | "name": "NOT NULL fields",
18 | "description": "Field should have valid value",
19 | "rule_type": "not null",
20 | "columns": [
21 | "difficulty",
22 | "avg_total_cooking_time"
23 | ]
24 | }
25 | ]
26 | }
27 |
--------------------------------------------------------------------------------
/conf/python/logging-properties.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": 1,
3 | "objects": {
4 | "queue": {
5 | "class": "queue.Queue",
6 | "maxsize": 1000
7 | }
8 | },
9 | "formatters": {
10 | "simple": {
11 | "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
12 | },
13 | "detailed": {
14 | "format": "%(asctime)s %(name)-15s %(levelname)-8s %(process)-10d %(funcName)-30s %(message)s"
15 | }
16 | },
17 | "handlers": {
18 | "console": {
19 | "class": "logging.StreamHandler",
20 | "level": "DEBUG",
21 | "formatter": "detailed",
22 | "stream": "ext://sys.stdout"
23 | },
24 | "file": {
25 | "class": "logging.FileHandler",
26 | "level": "DEBUG",
27 | "encoding": "utf-8",
28 | "formatter": "detailed",
29 | "filename": "logs/log-{job_name_placeholder}_{timestamp_placeholder}.log",
30 | "mode": "a"
31 | }
32 | },
33 | "loggers": {
34 | "simple": {
35 | "level": "INFO",
36 | "handlers": [
37 | "console"
38 | ],
39 | "propagate": "no"
40 | },
41 | "unit-tests": {
42 | "level": "DEBUG",
43 | "handlers": [
44 | "console"
45 | ],
46 | "propagate": "no"
47 | }
48 | },
49 | "root": {
50 | "level": "DEBUG",
51 | "handlers": [
52 | "console",
53 | "file"
54 | ]
55 | }
56 | }
57 |
--------------------------------------------------------------------------------
/conf/spark/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set everything to be logged to the console
2 | #Global logging
3 | log4j.rootCategory=WARN, console
4 | log4j.appender.console=org.apache.log4j.ConsoleAppender
5 | log4j.appender.console.target=System.err
6 | log4j.appender.console.layout=org.apache.log4j.PatternLayout
7 | log4j.appender.console.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n
8 |
9 | # Spark 3.x
10 | log4j.logger.org.sparkproject.jetty.server.handler.ContextHandler=WARN
11 |
12 | # Spark 2.x
13 | log4j.logger.org.spark_project.jetty.server.handler.ContextHandler=WARN
14 |
15 | # Send WARN or higher to stderr
16 | log4j.appender.stderr=org.apache.log4j.ConsoleAppender
17 | log4j.appender.stderr.Threshold=WARN
18 | log4j.appender.stderr.Target=System.err
19 | log4j.appender.stderr.layout=org.apache.log4j.PatternLayout
20 | log4j.appender.stderr.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n
21 |
22 | # Parquet related logging
23 | log4j.logger.parquet.name = org.apache.parquet.CorruptStatistics
24 | log4j.logger.parquet.level = WARN
25 | log4j.logger.parquet2.name = parquet.CorruptStatistics
26 | log4j.logger.parquet2.level = WARN
27 |
28 | # Hive metastore related logging
29 | logger.metastore.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler
30 | logger.metastore.level = FATAL
31 | logger.hive_functionregistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry
32 | logger.hive_functionregistry.level = ERROR
33 |
34 | # Settings to quiet third party logs that are too verbose
35 | log4j.logger.org.eclipse.jetty=WARN
36 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR
37 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO
38 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO
39 |
40 | # Reduce verbosity for other spammy core classes.
41 | log4j.logger.org.apache.spark=WARN
42 | log4j.logger.org.apache.spark.util=ERROR
43 | log4j.logger.org.apache.spark.network=WARN
44 | log4j.logger.akka=WARN
45 | log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=WARN
46 | log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=WARN
47 |
48 | # Hello Fresh com.vitthalmirji logging into separate file
49 | log4j.logger.com.vitthalmirji=INFO, vimAppender
50 | log4j.additivity.com.vitthalmirji=false
51 | log4j.appender.vimAppender=org.apache.log4j.FileAppender
52 | log4j.appender.vimAppender.File=${spark.yarn.app.container.log.dir}/stdout
53 | log4j.appender.vimAppender.layout=org.apache.log4j.PatternLayout
54 | log4j.appender.vimAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n
55 |
56 | # Spark Bigquery logging into separate file
57 | log4j.logger.com.google.cloud.spark.bigquery=INFO, sparkbigqueryAppender
58 | log4j.additivity.com.google.cloud.spark.bigquery=false
59 | log4j.appender.sparkbigqueryAppender=org.apache.log4j.FileAppender
60 | log4j.appender.sparkbigqueryAppender.File=${spark.yarn.app.container.log.dir}/spark-big-query.log
61 | log4j.appender.sparkbigqueryAppender.layout=org.apache.log4j.PatternLayout
62 | log4j.appender.sparkbigqueryAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n
63 |
64 | # Bigquery logging into separate file
65 | log4j.logger.com.google.cloud.bigquery=INFO, bigqueryAppender
66 | log4j.additivity.com.google.cloud.bigquery=false
67 | log4j.appender.bigqueryAppender=org.apache.log4j.FileAppender
68 | log4j.appender.bigqueryAppender.File=${spark.yarn.app.container.log.dir}/big-query.log
69 | log4j.appender.bigqueryAppender.layout=org.apache.log4j.PatternLayout
70 | log4j.appender.bigqueryAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n
71 |
72 | # Hudi logging into separate file
73 | log4j.logger.org.apache.hudi=INFO, hudiAppender
74 | log4j.additivity.org.apache.hudi=false
75 | log4j.appender.hudiAppender=org.apache.log4j.FileAppender
76 | log4j.appender.hudiAppender.File=${spark.yarn.app.container.log.dir}/hudi.log
77 | log4j.appender.hudiAppender.layout=org.apache.log4j.PatternLayout
78 | log4j.appender.hudiAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n
79 |
80 | # Cosmos logging into separate file
81 | log4j.logger.com.microsoft.azure.cosmosdb=INFO, cosmosdbAppender
82 | log4j.additivity.com.microsoft.azure.cosmosdb=false
83 | log4j.appender.cosmosdbAppender=org.apache.log4j.FileAppender
84 | log4j.appender.cosmosdbAppender.File=${spark.yarn.app.container.log.dir}/cosmosdb.log
85 | log4j.appender.cosmosdbAppender.layout=org.apache.log4j.PatternLayout
86 | log4j.appender.cosmosdbAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n
87 |
88 | # GCS logging into separate file
89 | log4j.logger.com.google.cloud.storage=INFO, gcsAppender
90 | log4j.additivity.com.google.cloud.storage=false
91 | log4j.appender.gcsAppender=org.apache.log4j.FileAppender
92 | log4j.appender.gcsAppender.File=${spark.yarn.app.container.log.dir}/gcs.log
93 | log4j.appender.gcsAppender.layout=org.apache.log4j.PatternLayout
94 | log4j.appender.gcsAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n
95 |
--------------------------------------------------------------------------------
/conf/spark/sparkConf.conf:
--------------------------------------------------------------------------------
1 | GLOBAL {
2 | "master" = "yarn"
3 | "hive.exec.dynamic.partition.mode" = "nonstrict"
4 | "hive.exec.dynamic.partition" = "true"
5 | "spark.sql.sources.partitionOverwriteMode" = "dynamic"
6 | "mapreduce.fileoutputcommitter.algorithm.version" = "2"
7 | "parquet.enable.summary-metadata" = "false"
8 | "parquet.compression" = "snappy"
9 | "spark.sql.parquet.mergeSchema" = "false"
10 | "spark.sql.parquet.filterPushdown" = "true"
11 | "spark.sql.hive.metastorePartitionPruning" = "true"
12 | "spark.sql.orc.filterPushdown" = "true"
13 | "spark.sql.orc.splits.include.file.footer" = "true"
14 | "spark.sql.orc.cache.stripe.details.size" = "10000"
15 | "spark.sql.broadcastTimeout" = "1800"
16 | }
17 |
18 | LOCAL {
19 | "master" = "local[*]"
20 | "spark.hadoop.hive.exec.dynamic.partition.mode" = "nonstrict"
21 | "spark.hadoop.hive.exec.dynamic.partition" = "true"
22 | "spark.sql.sources.partitionOverwriteMode" = "dynamic"
23 | "spark.executor.instances" = "1"
24 | }
25 |
--------------------------------------------------------------------------------
/docs/APIDOC.MD:
--------------------------------------------------------------------------------
1 | # Datalake ETL Pipeline
2 | Data transformation simplified for any Data platform.
3 |
4 | `Features:` The package has complete ETL process -
5 | 1. Uses metadata, transformation & data model information to design ETL pipeline
6 | 2. Builds target transformation SparkSQL and Spark Dataframes
7 | 3. Builds source & target Hive DDLs
8 | 4. Validates DataFrames, extends core classes, defines DataFrame transformations, and provides UDF SQL functions.
9 | 5. Supports below fundamental transformations for ETL pipeline -
10 | * Filters on source & target dataframes
11 | * Grouping and Aggregations on source & target dataframes
12 | * Heavily nested queries / dataframes
13 | 6. Has complex and heavily nested XML, JSON, Parquet & ORC parser to nth
14 | level of nesting
15 | 7. Has Unit test cases designed on function/method level & measures
16 | source code coverage
17 | 8. Has information about delpoying to higher environments
18 | 9. Has API documentation for customization & enhancement
19 |
20 | `Enhancements:` In progress -
21 | 1. Integrate Audit and logging - Define Error codes, log process
22 | failures, Audit progress & runtime information
23 |
24 | # Datalake ETL Pipeline API documentation
25 | ## Mappers for complex/nested data sources
26 | * Has interface `IMapper` and implemented concrete class `XmlMapper`. We
27 | can use same abstract / interface for other category of file mapping
28 | viz. XML/JSON/Parquet/ORC.
29 | * Core methods/function common for overriding
30 | are – `getDataframeSchema`, `createDDL`, `complexTypeIterator`,
31 | `handleStructType`, `handleArrayType`
32 |
33 | * Overview of complex type parsing & exploding -
34 | * 
35 | ```
36 | def handleStructType(self, viewname, viewpath, database, table, xpath, level, dtype, acc={}, xpaths=[])
37 | ```
38 | ```
39 | def handleArrayType(self, viewname, viewpath, database, table, xpath, level, dtype: ArrayType, acc={}, xpaths=[])
40 | ```
41 | ```
42 | def complexTypeIterator(self, viewname, viewpath, database, table, xpath, level, dtype: DataType, acc={}, xpaths=[])
43 | ```
44 |
45 | ### XmlMapper
46 | * `XmlMapper` specific methods / functions – `createViewsAndXpaths`,
47 | `buildXmlSerdeDDL`
48 |
49 | ```
50 | def createViewsAndXpaths(self, df: DataFrame, database, table)
51 | ```
52 | ```
53 | def buildXmlSerdeDdl(self, database, table, xmlsourcelocation, xmlrowstarttag, xmlrowendtag)
54 | ```
55 |
56 | ## Pyspark Core Class Extensions
57 |
58 | ```
59 | from etl.meta import *
60 | ```
61 |
62 | ### Column Extensions
63 |
64 | **isFalsy()**
65 |
66 | ```python
67 | source_df.withColumn("is_stuff_falsy", F.col("has_stuff").isFalsy())
68 | ```
69 |
70 | Returns `True` if `has_stuff` is `None` or `False`.
71 |
72 | **isTruthy()**
73 |
74 | ```python
75 | source_df.withColumn("is_stuff_truthy", F.col("has_stuff").isTruthy())
76 | ```
77 |
78 | Returns `True` unless `has_stuff` is `None` or `False`.
79 |
80 | **isNullOrBlank()**
81 |
82 | ```python
83 | source_df.withColumn("is_blah_null_or_blank", F.col("blah").isNullOrBlank())
84 | ```
85 |
86 | Returns `True` if `blah` is `null` or blank (the empty string or a string that only contains whitespace).
87 |
88 | **isNotIn()**
89 |
90 | ```python
91 | source_df.withColumn("is_not_bobs_hobby", F.col("fun_thing").isNotIn(bobs_hobbies))
92 | ```
93 |
94 | Returns `True` if `fun_thing` is not included in the `bobs_hobbies` list.
95 |
96 | **nullBetween()**
97 |
98 | ```python
99 | source_df.withColumn("is_between", F.col("age").nullBetween(F.col("lower_age"), F.col("upper_age")))
100 | ```
101 |
102 | Returns `True` if `age` is between `lower_age` and `upper_age`. If `lower_age` is populated and `upper_age` is `null`, it will return `True` if `age` is greater than or equal to `lower_age`. If `lower_age` is `null` and `upper_age` is populate, it will return `True` if `age` is lower than or equal to `upper_age`.
103 |
104 | ### SparkSession Extensions
105 |
106 | **create_df()**
107 |
108 | ```python
109 | spark.create_df(
110 | [("jose", "a"), ("li", "b"), ("sam", "c")],
111 | [("name", StringType(), True), ("blah", StringType(), True)]
112 | )
113 | ```
114 |
115 | Creates DataFrame with a syntax that's less verbose than the built-in `createDataFrame` method.
116 |
117 | ### DataFrame Extensions
118 |
119 | **applyTransform()**
120 |
121 | ```python
122 | source_df\
123 | .applyTransform(lambda df: with_greeting(df))\
124 | .applyTransform(lambda df: with_something(df, "crazy"))
125 | ```
126 |
127 | Allows for multiple DataFrame transformations to be run and executed.
128 |
129 | ## Helper Functions
130 |
131 | ```python
132 |
133 | import etl
134 | ```
135 |
136 | ### DataFrame Validations
137 |
138 | **validatePresenceOfColumns()**
139 |
140 | ```python
141 | etl.meta.validatePresenceOfColumns(source_df, ["name", "age", "fun"])
142 | ```
143 |
144 | Raises an exception unless `source_df` contains the `name`, `age`, and `fun` column.
145 |
146 | **validateSchema()**
147 |
148 | ```python
149 | etl.meta.validateSchema(source_df, required_schema)
150 | ```
151 |
152 | Raises an exception unless `source_df` contains all the `StructFields` defined in the `required_schema`.
153 |
154 | **validateAbsenseOfColumns()**
155 |
156 | ```python
157 | etl.meta.validateAbsenseOfColumns(source_df, ["age", "cool"])
158 | ```
159 |
160 | Raises an exception if `source_df` contains `age` or `cool` columns.
161 |
162 | ### Functions
163 |
164 | **single_space()**
165 |
166 | ```python
167 | actual_df = source_df.withColumn(
168 | "words_single_spaced",
169 | etl.meta.single_space(col("words"))
170 | )
171 | ```
172 |
173 |
174 | Replaces all multispaces with single spaces (e.g. changes `"this has some"` to `"this has some"`.
175 |
176 | **remove_all_whitespace()**
177 |
178 | ```python
179 | actual_df = source_df.withColumn(
180 | "words_without_whitespace",
181 | etl.meta.remove_all_whitespace(col("words"))
182 | )
183 | ```
184 |
185 | Removes all whitespace in a string (e.g. changes `"this has some"` to `"thishassome"`.
186 |
187 | **anti_trim()**
188 |
189 | ```python
190 | actual_df = source_df.withColumn(
191 | "words_anti_trimmed",
192 | etl.meta.anti_trim(col("words"))
193 | )
194 | ```
195 |
196 | Removes all inner whitespace, but doesn't delete leading or trailing whitespace (e.g. changes `" this has some "` to `" thishassome "`.
197 |
198 | **remove_non_word_characters()**
199 |
200 | ```python
201 | actual_df = source_df.withColumn(
202 | "words_without_nonword_chars",
203 | etl.meta.remove_non_word_characters(col("words"))
204 | )
205 | ```
206 |
207 | Removes all non-word characters from a string (e.g. changes `"si%$#@!#$!@#mpsons"` to `"simpsons"`.
208 |
209 | **exists()**
210 |
211 | ```python
212 | source_df.withColumn(
213 | "any_num_greater_than_5",
214 | etl.meta.exists(lambda n: n > 5)(col("nums"))
215 | )
216 | ```
217 |
218 | `nums` contains lists of numbers and `exists()` returns `True` if any of the numbers in the list are greater than 5. It's similar to the Python `any` function.
219 |
220 | **forall()**
221 |
222 | ```python
223 | source_df.withColumn(
224 | "all_nums_greater_than_3",
225 | etl.meta.forall(lambda n: n > 3)(col("nums"))
226 | )
227 | ```
228 |
229 | `nums` contains lists of numbers and `forall()` returns `True` if all of the numbers in the list are greater than 3. It's similar to the Python `all` function.
230 |
231 | **multi_equals()**
232 |
233 | ```python
234 | source_df.withColumn(
235 | "are_s1_and_s2_cat",
236 | etl.meta.multi_equals("cat")(col("s1"), col("s2"))
237 | )
238 | ```
239 |
240 | `multi_equals` returns true if `s1` and `s2` are both equal to `"cat"`.
241 |
242 | ### Transformations
243 |
244 | **snakeCaseColumnNames()**
245 |
246 | ```python
247 | etl.meta.snakeCaseColumnNames(source_df)
248 | ```
249 |
250 | Converts all the column names in a DataFrame to snake_case. It's annoying to write SQL queries when columns aren't snake cased.
251 |
252 | **sort_columns()**
253 |
254 | ```python
255 | etl.meta.sort_columns(source_df, "asc")
256 | ```
257 |
258 | Sorts the DataFrame columns in alphabetical order. Wide DataFrames are easier to navigate when they're sorted alphabetically.
259 |
260 | ### DataFrame Helpers
261 |
262 | **columnToList()**
263 |
264 | ```python
265 | etl.meta.columnToList(source_df, "name")
266 | ```
267 |
268 | Converts a column in a DataFrame to a list of values.
269 |
270 | **twoColumns2Dictionary()**
271 |
272 | ```python
273 | etl.meta.twoColumns2Dictionary(source_df, "name", "age")
274 | ```
275 |
276 | Converts two columns of a DataFrame into a dictionary. In this example, `name` is the key and `age` is the value.
277 |
278 | **toListOfDictionaries()**
279 |
280 | ```python
281 | etl.meta.toListOfDictionaries(source_df)
282 | ```
283 | Converts an entire DataFrame into a list of dictionaries.
284 |
--------------------------------------------------------------------------------
/docs/PysparkLocalSetup.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/PysparkLocalSetup.docx
--------------------------------------------------------------------------------
/docs/SETUP.MD:
--------------------------------------------------------------------------------
1 | # Datalake ETL Pipeline
2 | Data transformation simplified for any Data platform.
3 |
4 | `Features:` The package has complete ETL process -
5 | 1. Uses metadata, transformation & data model information to design ETL pipeline
6 | 2. Builds target transformation using both SparkSQL and Spark Dataframes
7 | \- Developer to choose option
8 | 3. Builds source & target Hive DDLs
9 | 4. Validates DataFrames, extends core classes, defines DataFrame transformations, and provides UDF SQL functions.
10 | 5. Supports below fundamental transformations for ETL pipeline -
11 | * Filters on source & target dataframes
12 | * Grouping and Aggregations on source & target dataframes
13 | * Heavily nested queries / dataframes
14 | 6. Has complex and heavily nested XML, JSON, Parquet & ORC parser to nth
15 | level of nesting
16 | 7. Has Unit test cases designed on function/method level & measures
17 | source code coverage
18 | 8. Has information about delpoying to higher environments
19 | 9. Has API documentation for customization & enhancement
20 |
21 | `Enhancements:` In progress -
22 | 1. Integrate Audit and logging - Define Error codes, log process failures, Audit progress & runtime information
23 |
24 | ## Setup for Python and Pyspark on Windows & Linux machines locally
25 |
26 | First please install 64 bit JDK 1.8 or 8 for your operating system from Oracle https://www.oracle.com/java/technologies/javase-jdk8-downloads.html
27 | set below below environment variable pointing to home directory of java (Please remember the path mentioned below may be different for you)
28 | ```
29 | JAVA_HOME=C:\Program Files\Java\jdk1.8.0_231
30 | ```
31 |
32 | Next, Install PyCharm community edition from https://www.jetbrains.com/pycharm/download/
33 |
34 |
35 | Make sure you have Python >= 3.0 and Do not use Python 3.8; Recommended is Python 3.7 and PySpark 2.3.4
36 | If you do not have pip tool, get it from https://pypi.org/project/pip/ and execute below command
37 | **If you already have Python & pip installed skip the below step**
38 | ```
39 | python get-pip.py
40 | ```
41 |
42 | Once you have Python and pip follow steps below -
43 | 1. Install below libraries for virtual environment (Linux)
44 | ```
45 | pip install virtualenvwrapper
46 | ```
47 | Windows -
48 | ```
49 | pip install virtualenvwrapper-win
50 | ```
51 | Set up the working directory for virtual environments (Linux)
52 | ```
53 | export WORKON_HOME=~/Envs
54 | mkdir -p $WORKON_HOME
55 | source /usr/local/bin/virtualenvwrapper.sh
56 | ```
57 | Windows
58 | ```
59 | Create directory C:\Users\\Envs
60 | WORKON_HOME=C:\Users\\Envs
61 | ```
62 |
63 | 2. Setup below **ENVIRONMENT VARIABLES**
64 |
65 | **Unix / Linux / Mac**
66 |
67 | `Please note: Your computer path may vary, use your computer path in below given in example `
68 | - **PYTHONPATH** - Full path to python executable
69 | ```
70 | export PYTHONPATH=/usr/python37
71 | ```
72 | - **PATH** - Update PATH variable add PYTHONPATH
73 | ```
74 | export PATH=$PATH%:$PYTHONPATH
75 | ```
76 | - **VIRTUALENV_PYTHON** - To create virtual environments. Path is same as PYTHONPATH
77 | ```
78 | export VIRTUALENV_PYTHON=$PYTHONPATH
79 | ```
80 | - **VIRTUALENVWRAPPER_VIRTUALENV** - Wrapper for Virtual Environment tools. Path is Scripts folder under PYTHONPATH
81 | ```
82 | VIRTUALENVWRAPPER_VIRTUALENV=$PYTHONPATH/Scripts
83 | ```
84 | **Windows**
85 |
86 | `Please note: Your computer path may vary, use your computer path in below given in example `
87 | - **PYTHONPATH** - Full path to python.exe
88 | ```
89 | PYTHONPATH=C:\Program Files\Python37
90 | ```
91 | - **PATH** - Update PATH variable add PYTHONPATH
92 | ```
93 | PATH=%PATH%;%PYTHONPATH%
94 | ```
95 | - **VIRTUALENV_PYTHON** - To create virtual environments. Path is same as PYTHONPATH
96 | ```
97 | VIRTUALENV_PYTHON=%PYTHONPATH%
98 | ```
99 | - **VIRTUALENVWRAPPER_VIRTUALENV** - Wrapper for Virtual Environment tools. Path is Scripts folder under PYTHONPATH
100 | ```
101 | VIRTUALENVWRAPPER_VIRTUALENV=%PYTHONPATH%\Scripts
102 |
103 | ```
104 | 4. Install Hadoop Binaries for Windows: winutils.exe; Note this is only for Windows. Linux users Download Spark libraries for Linux -
105 | - Download binaries from URL https://github.com/steveloughran/winutils
106 | - Unzip and place in some directory: For Example I'm using C:\winutils-master
107 | - Within extracted folder we have hadoop-2.7.1 --> Example: C:\winutils-master\hadoop-2.7.1
108 | - Declare an Environment variable HADOOP_HOME = C:\winutils-master\hadoop-2.7.1 and update PATH Variable
109 | ```
110 | HADOOP_HOME=C:\winutils-master\hadoop-2.7.1
111 | PATH=%PATH%;%HADOOP_HOME%\bin
112 | ```
113 | 5. Check if all the Environment Variables are working -
114 | - Open Command prompt
115 | - Type `python` it must open Python 3.7.x
116 | - Type `winutils`, `hadoop`, `hdfs` it must give help instructions of HDFS
117 |
118 | 3. Create a spark project -
119 | - Create a directory for example `datalake-etl-pipeline` anywhere in your computer; Here I'm using `/home/` and create a new virtual environment using commands below -
120 | ```
121 | mkvirtualenv -a -p py37
122 | workon py37
123 | cdproject
124 | ```
125 | `cdproject` command will switch to `datalake-etl-pipeline` folder
126 |
127 | Import required libraries from requirements.txt from `datalake-etl-pipeline` root folder, use command below –
128 | ```
129 | pip install -r requirements.txt
130 | ```
131 | To Freeze the existing installed packages use command below -
132 | ```
133 | pip freeze > requirements.txt
134 | ```
135 | # ToDo - Yet to add instructions for deploying into higher environments
136 | ## Delpoy to higher environment
--------------------------------------------------------------------------------
/docs/images/DataQualityUML.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/DataQualityUML.png
--------------------------------------------------------------------------------
/docs/images/XMLParse.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/XMLParse.png
--------------------------------------------------------------------------------
/docs/images/dq-task1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/dq-task1.png
--------------------------------------------------------------------------------
/docs/images/dq-task2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/dq-task2.png
--------------------------------------------------------------------------------
/docs/images/task1_ouput_er.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/task1_ouput_er.png
--------------------------------------------------------------------------------
/docs/images/task2_ouput_er.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/task2_ouput_er.png
--------------------------------------------------------------------------------
/docs/setup.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
25 |
Data transformation simplified for any Data platform.
26 |
Features:
The package has complete ETL process -
27 |
28 | Uses metadata, transformation & data model information to design ETL pipeline
29 | Builds target transformation SparkSQL and Spark Dataframes
30 | Builds source & target Hive DDLs
31 | Validates DataFrames, extends core classes, defines DataFrame transformations, and provides
32 | UDF SQL functions.
33 |
34 | Supports below fundamental transformations for ETL pipeline -
35 |
36 | Filters on source & target dataframes
37 | Grouping and Aggregations on source & target dataframes
38 | Heavily nested queries / dataframes
39 |
40 |
41 | Has complex and heavily nested XML, JSON, Parquet & ORC parser to nth
42 | level of nesting
43 |
44 | Has Unit test cases designed on function/method level & measures
45 | source code coverage
46 |
47 | Has information about delpoying to higher environments
48 | Has API documentation for customization & enhancement
49 |
50 |
Enhancements:
In progress -
51 |
52 | Integrate Audit and logging - Define Error codes, log process failures, Audit progress
53 | & runtime information
54 |
55 |
56 |
57 |
Make sure you have Python >= 3.0.
58 | If you do not have pip tool, get it from https://pypi.org/project/pip/ and execute below command
59 | If you already have Python & pip installed skip the below step
60 |
python get-pip.py
61 |
62 |
Once you have Python and pip follow steps below -
63 |
64 |
65 | Install below libraries for virtual environment (Linux)
66 | pip install virtualenvwrapper
67 |
68 | Windows -
69 | pip install virtualenvwrapper-win
70 |
71 | Set up the working directory for virtual environments (Linux)
72 | export WORKON_HOME=~/Envs
73 | mkdir -p $WORKON_HOME
74 | source /usr/local/bin/virtualenvwrapper.sh
75 |
76 | Windows
77 | set WORKON_HOME=C:\Users\<username>\.Envs
78 | mkdir -p %WORKON_HOME%
79 | source /usr/local/bin/virtualenvwrapper.sh
80 |
81 |
82 |
83 | Setup below ENVIRONMENT VARIABLES
84 | Unix / Linux / Mac
85 | Please note: Your computer path may vary, use your computer
86 | path in below given in example
87 |
88 | PYTHONPATH - Full path to python executable
89 | export PYTHONPATH=/usr/python37
90 |
91 |
92 | PATH - Update PATH variable add PYTHONPATH
93 | export PATH=$PATH%:$PYTHONPATH
94 |
95 |
96 | VIRTUALENV_PYTHON - To create virtual
97 | environments. Path is same as PYTHONPATH
98 | export VIRTUALENV_PYTHON=$PYTHONPATH
99 |
100 |
101 | VIRTUALENVWRAPPER_VIRTUALENV - Wrapper for
102 | Virtual Environment tools. Path is Scripts folder under PYTHONPATH
103 | VIRTUALENVWRAPPER_VIRTUALENV=$PYTHONPATH/Scripts
104 |
105 |
106 |
107 | Windows
108 | Please note: Your computer path may vary, use your computer
109 | path in below given in example
110 |
111 | PYTHONPATH - Full path to python.exe
112 | PYTHONPATH=C:\Program Files\Python37
113 |
114 |
115 | PATH - Update PATH variable add PYTHONPATH
116 | PATH=%PATH%;%PYTHONPATH%
117 |
118 |
119 | VIRTUALENV_PYTHON - To create virtual
120 | environments. Path is same as PYTHONPATH
121 | VIRTUALENV_PYTHON=%PYTHONPATH%
122 |
123 |
124 | VIRTUALENVWRAPPER_VIRTUALENV - Wrapper for
125 | Virtual Environment tools. Path is Scripts folder under PYTHONPATH
126 | VIRTUALENVWRAPPER_VIRTUALENV=%PYTHONPATH%\Scripts
127 |
128 |
129 |
130 |
131 |
132 | Copy the Vitthal-datalake folder to your home folder /home/<username>
and create a new virtual environment using commands
134 | below -
135 | mkvirtualenv -a <path-to-marriot-datalake> -p <full-path-to-python.exe> py37
136 | workon py37
137 | cdproject
138 |
139 | cdproject
command will switch to Vitthal-datalake
140 | folder
141 | Import required libraries from requirements.txt from Vitthal-datalake root folder,
142 | use command below –
143 | pip install -r requirements.txt
144 |
145 |
146 |
147 |
150 |
151 |
152 |
--------------------------------------------------------------------------------
/logs/bash/logs:
--------------------------------------------------------------------------------
1 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - [0m55729 - Executing create_python_venv.sh on m-c02f6224md6n with arguments
2 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - [93mWARN 55729 - Checking if python3 is installed on machine[0m
3 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - /Library/Frameworks/Python.framework/Versions/3.9/bin/python3
4 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - [0m55729 - python3 already installed on machine
5 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - [93mWARN 55729 - Checking if pip tool is installed on machine or attempt to download from internet & install[0m
6 | 2023-04-18T18:06:36+05:30: EXECUTION-LOG - pip 21.1.3 from /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pip (python 3.9)
7 | 2023-04-18T18:06:36+05:30: EXECUTION-LOG - [0m55729 - pip tool already available on machine
8 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Defaulting to user installation because normal site-packages is not writeable
9 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: virtualenvwrapper in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (4.8.4)
10 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: virtualenv in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenvwrapper) (20.4.7)
11 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: virtualenv-clone in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenvwrapper) (0.5.4)
12 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: stevedore in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenvwrapper) (3.3.0)
13 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: pbr!=2.1.0,>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from stevedore->virtualenvwrapper) (5.6.0)
14 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: six<2,>=1.9.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenv->virtualenvwrapper) (1.16.0)
15 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: appdirs<2,>=1.4.3 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenv->virtualenvwrapper) (1.4.4)
16 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: distlib<1,>=0.3.1 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenv->virtualenvwrapper) (0.3.2)
17 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: filelock<4,>=3.0.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenv->virtualenvwrapper) (3.0.12)
18 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - [33mDEBUG 55729 - Using below environment variables & path for virtual environment test[0m
19 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - [33mDEBUG 55729 - VIRTUALENVWRAPPER_PYTHON=/Library/Frameworks/Python.framework/Versions/3.9/bin/python3[0m
20 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - [33mDEBUG 55729 - WORKON_HOME=~/python_venvs/[0m
21 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - created virtual environment CPython3.9.5.final.0-64 in 1454ms
22 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - creator CPython3Posix(dest=/Users/v0m02sj/python_venvs/test, clear=False, no_vcs_ignore=False, global=False)
23 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/Users/v0m02sj/Library/Application Support/virtualenv)
24 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - added seed packages: pip==21.1.2, setuptools==57.0.0, wheel==0.36.2
25 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - activators BashActivator,CShellActivator,FishActivator,PowerShellActivator,PythonActivator,XonshActivator
26 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - Setting project for test to /Users/v0m02sj/PycharmProjects/vim89-data-engineering-test
27 | 2023-04-18T18:06:46+05:30: EXECUTION-LOG - Removing test...
28 | 2023-04-18T18:06:47+05:30: EXECUTION-LOG - [0m55729 - Process finished successfully, logs can be found at /Users/v0m02sj/PycharmProjects/vim89-data-engineering-test/scripts/bin/../logs/log-python-venv-setup-test-2023-04-18-18.06.35.log
29 |
--------------------------------------------------------------------------------
/logs/python/log-sample:
--------------------------------------------------------------------------------
1 | 2023-04-19 01:32:44,837 root INFO 63460 init_logging Logging initiated; appending logs to /Users/v0m02sj/PycharmProjects/vim89-data-engineering-test/logs/python/log-Hello-World_2023-04-19T01-32-44-810431.log
2 | 2023-04-19 01:32:44,837 root WARNING 63460 main HelloFresh Recipes Data Engineering
3 | 2023-04-19 01:32:44,838 root WARNING 63460 get_or_create_spark_session Creating spark session first time with configs [{'key': 'spark.app.name', 'value': ''}]
4 | 2023-04-19 01:32:44,838 root INFO 63460 read_data_as_spark_dataframe Attempting to read json in spark using configs {'encoding': 'UTF-8'} from location /Users/v0m02sj/PycharmProjects/vim89-data-engineering-test/resources/input
5 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | psycopg2-binary
2 | psycopg2
3 | botocore
4 | boto3
5 | boto
6 | awscli
7 | dos2unix
8 | lxml
9 | mock
10 | moto
11 | urllib3
12 | tqdm
13 | xmlschema
14 | xlrd
15 | awsglue-local
16 | flask
17 | flask_cors
18 | coverage
19 | isodate
20 | py4j
21 | pyspark
22 | pyspark-stubs
23 | pytz
24 | tabulate
--------------------------------------------------------------------------------
/resources/data-quality-reports/recipe-tasks/task1-dq-report.html:
--------------------------------------------------------------------------------
1 |
2 | Team, Data Quality check finished successfully for DQ ID = 101 , with failures. Check details in
3 | below table of metrics.
4 | Failed DQ details
5 |
6 |
7 | Yarn Application Id
8 | DQ ID
9 | Rule ID
10 | Rule Name
11 | Rule type
12 | Description
13 | Columns/Query
14 | Pass Count
15 | Fail Count
16 | Total Count
17 |
18 |
19 | local-1682280549403
20 | 101
21 | 1011
22 | Primary / Natural Keys
23 | unique
24 | Primary / Natural Keys should not have duplicates
25 | ['name']
26 | 1039
27 | 3
28 | 1042
29 |
30 |
31 | local-1682280549403
32 | 101
33 | 1012
34 | NOT NULL fields
35 | not null
36 | Field should have valid value
37 | ['name', 'cookTime', 'prepTime']
38 | 715
39 | 327
40 | 1042
41 |
42 |
43 | local-1682280549403
44 | 101
45 | 1014
46 | "Check for invalid cook & prep time
47 | query
48 | Check empty or null values
49 | None
50 | 716
51 | 326
52 | 1042
53 |
54 |
55 | Succeeded DQ details
56 |
57 |
58 | Yarn Application Id
59 | DQ ID
60 | Rule ID
61 | Rule Name
62 | Rule type
63 | Description
64 | Columns/Query
65 | Pass Count
66 | Fail Count
67 | Total Count
68 |
69 |
70 | local-1682280549403
71 | 101
72 | 1013
73 | Input files check
74 | query
75 | Check If all input files are read for processing
76 | None
77 | 1042
78 | 0
79 | 1042
80 |
81 |
82 | Executed on 2023-04-24 01:39:19, Thanks
83 |
84 |
--------------------------------------------------------------------------------
/resources/data-quality-reports/recipe-tasks/task2-dq-report.html:
--------------------------------------------------------------------------------
1 |
2 | Team, Data Quality check finished successfully for DQ ID = 101 . Check details in below table of
3 | metrics.
4 | Succeeded DQ details
5 |
6 |
7 | Yarn Application Id
8 | DQ ID
9 | Rule ID
10 | Rule Name
11 | Rule type
12 | Description
13 | Columns/Query
14 | Pass Count
15 | Fail Count
16 | Total Count
17 |
18 |
19 | local-1682280549403
20 | 101
21 | 1015
22 | Primary / Natural Keys
23 | unique
24 | Primary / Natural Keys should not have duplicates
25 | ['difficulty']
26 | 3
27 | 0
28 | 3
29 |
30 |
31 | local-1682280549403
32 | 101
33 | 1016
34 | NOT NULL fields
35 | not null
36 | Field should have valid value
37 | ['difficulty', 'avg_total_cooking_time']
38 | 3
39 | 0
40 | 3
41 |
42 |
43 | Executed on 2023-04-24 01:39:24, Thanks
44 |
45 |
--------------------------------------------------------------------------------
/resources/data/clinical_trial/data/chunk1.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/clinical_trial/data/chunk1.zip
--------------------------------------------------------------------------------
/resources/data/clinical_trial/job_parameters/clinical_trial.json:
--------------------------------------------------------------------------------
1 | {
2 | "clinical_trial_etl": {
3 | "bucket": "dev",
4 | "landing_directory": "data/raw/clinical_trial_landing",
5 | "staging_directory": "data/raw/clinical_trial_staging",
6 | "download_url_prefix_test": "https://github.com/vim89/datalake-etl-pipeline/raw/master/src/resources/clinical_trial/data/chunk",
7 | "download_url_prefix": "https://clinicaltrials.gov/ct2/download_studies?down_chunk=",
8 | "max_chunk_range": 2,
9 | "download_target_filename": "clinical_studies.zip",
10 | "xml_closing_tag": "clinical_study",
11 | "xml_root_tag": "clinical_study",
12 | "xml_row_tag": "clinical_study",
13 | "xml_attribute_tag": "xml_attribute_value",
14 | "xml_attribute_prefix": "xmlattribute_",
15 | "xml_value_tag": "xml_value_tag",
16 | "audit_columns_definition": [
17 | "reverse(split(input_file_name(), '/'))[0] AS xml_file_name",
18 | "CAST('{ts}' AS TIMESTAMP) AS spark_timestamp"
19 | ],
20 | "audit_columns": [
21 | "xml_file_name",
22 | "spark_timestamp"
23 | ],
24 | "timestamp_column": [
25 | "spark_timestamp"
26 | ],
27 | "primary_keys": [
28 | "id_info.nct_id",
29 | "xml_file_name"
30 | ],
31 | "primary_keys_cascade_to_leaf_level_with_alias": [
32 | "id_info.nct_id AS pk_nct_id",
33 | "spark_timestamp AS spark_ts"
34 | ],
35 | "order_by_keys": [
36 | "spark_timestamp"
37 | ],
38 | "hashcode_column": [
39 | "hashcode"
40 | ],
41 | "target_primary_keys": [
42 | "nct_id"
43 | ],
44 | "hashcode_encryption_type": "md5",
45 | "cdc_staging_data_write_mode": "append",
46 | "audit_directory": "audit/",
47 | "job_name": "clinical_trial_etl"
48 | }
49 | }
--------------------------------------------------------------------------------
/resources/data/clinical_trial/sql/transformations/sponsors.sql:
--------------------------------------------------------------------------------
1 | SELECT
2 | nct_id,
3 | agency_class,
4 | lead_or_collaborator,
5 | name,
6 | CAST(last_update_timestamp AS TIMESTAMP) AS last_update_timestamp
7 | FROM
8 | (
9 | SELECT xmltable_sponsors_lead_sponsor.pk_nct_id AS nct_id, agency_class agency_class, 'lead' AS lead_or_collaborator,
10 | agency AS name, xmltable_sponsors_lead_sponsor.spark_ts AS last_update_timestamp
11 | FROM xmltable_sponsors_lead_sponsor
12 | LEFT JOIN xmltable_sponsors ON
13 | xmltable_sponsors.surrogate_id_xmltable_sponsors = xmltable_sponsors_lead_sponsor.surrogate_id_xmltable_sponsors
14 | AND xmltable_sponsors.pk_nct_id = xmltable_sponsors_lead_sponsor.pk_nct_id
15 |
16 | UNION ALL
17 |
18 | SELECT xmltable_sponsors_collaborator.pk_nct_id AS nct_id, agency_class AS agency_class,
19 | 'collaborator' AS lead_or_collaborator, agency AS name,
20 | xmltable_sponsors_collaborator.spark_ts AS last_update_timestamp
21 | FROM xmltable_sponsors_collaborator
22 | LEFT JOIN xmltable_sponsors ON xmltable_sponsors.surrogate_id_xmltable_sponsors = xmltable_sponsors_collaborator.surrogate_id_xmltable_sponsors
23 | AND xmltable_sponsors.pk_nct_id = xmltable_sponsors_collaborator.pk_nct_id
24 | ) sponsors
25 |
--------------------------------------------------------------------------------
/resources/data/config/application_properties.json:
--------------------------------------------------------------------------------
1 | {
2 | "version": 1,
3 | "default_settings": {
4 | "max_parallel_spark_submit_process": 6,
5 | "history_load_interval_in_days": 30
6 | },
7 | "command_line_args": [
8 | {
9 | "name": "workflow",
10 | "type": "string",
11 | "default": "None"
12 | },
13 | {
14 | "name": "startDate",
15 | "type": "string",
16 | "default": "None"
17 | },
18 | {
19 | "name": "endDate",
20 | "type": "string",
21 | "default": "None"
22 | },
23 | {
24 | "name": "refreshType",
25 | "type": "string",
26 | "default": "None"
27 | },
28 | {
29 | "name": "dq_enabled",
30 | "type": "string",
31 | "default": "N"
32 | },
33 | {
34 | "name": "configFile",
35 | "type": "string",
36 | "default": "/u/users/svcdvchnlperf/adhoc/config-prod.yml"
37 | }
38 | ],
39 | "spark_submit_options_order": {
40 | "spark-submit": {
41 | "priority": 0,
42 | "required": false,
43 | "value": ""
44 | },
45 | "--master": {
46 | "priority": 1,
47 | "required": true,
48 | "value": "yarn"
49 | },
50 | "--deploy-mode": {
51 | "priority": 2,
52 | "required": true,
53 | "value": "cluster"
54 | },
55 | "--executor-cores": {
56 | "priority": 3,
57 | "required": true,
58 | "value": 5
59 | },
60 | "--executor-memory": {
61 | "priority": 4,
62 | "required": true,
63 | "value": "4g"
64 | },
65 | "--num-executors": {
66 | "priority": 5,
67 | "required": true,
68 | "value": 20
69 | },
70 | "--driver-memory": {
71 | "priority": 6,
72 | "required": true,
73 | "value": "6g"
74 | },
75 | "--name": {
76 | "priority": 7,
77 | "required": false,
78 | "value": "Channel Performance Spark Job"
79 | },
80 | "--driver-java-options": {
81 | "priority": 8,
82 | "required": true,
83 | "value": ""
84 | },
85 | "--conf": {
86 | "priority": 9,
87 | "required": true,
88 | "value": "\"spark.executor.memory=4g\""
89 | },
90 | "--jars": {
91 | "priority": 10,
92 | "required": true,
93 | "value": "\"/u/users/svcdvchnlperf/adhoc/ScalaSparkArchetypeCore-1.9.3-bundled.jar\""
94 | },
95 | "--files": {
96 | "priority": 11,
97 | "required": true,
98 | "value": "\"/u/users/svcdvchnlperf/adhoc/connections/connection.yaml,/u/users/svcdvchnlperf/adhoc/connections/job.yaml\""
99 | },
100 | "--class": {
101 | "priority": 12,
102 | "required": true,
103 | "value": "com.walmartlabs.channel.perf.WorkflowController "
104 | },
105 | "--class_arguments": {
106 | "priority": 13,
107 | "required": false,
108 | "value": {
109 | "workflow": "",
110 | "dq_enabled": "",
111 | "startDate": "",
112 | "endDate": "",
113 | "refreshType": "",
114 | "configFile": ""
115 | }
116 | }
117 | },
118 | "spark_submit_options_filter": [
119 | "primary_keys",
120 | "ADHOC_SCHEMA_GCS_BUCKET",
121 | "STG_SCHEMA_GCS_BUCKET",
122 | "APP_SCHEMA_GCS_BUCKET",
123 | "ADHOC_SCHEMA",
124 | "STG_SCHEMA",
125 | "APP_SCHEMA",
126 | "env",
127 | "enableservices",
128 | "runmode",
129 | "srcrcvts",
130 | "userId"
131 | ]
132 | }
--------------------------------------------------------------------------------
/resources/data/config/application_properties.yaml:
--------------------------------------------------------------------------------
1 | version: 1
2 | default_settings:
3 | max_parallel_spark_submit_process: 5
4 | history_load_interval_in_days: 30
--------------------------------------------------------------------------------
/resources/data/config/logging.yaml:
--------------------------------------------------------------------------------
1 | version: 1
2 | objects:
3 | queue:
4 | class: queue.Queue
5 | maxsize: 1000
6 | formatters:
7 | simple:
8 | format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
9 | detailed:
10 | format: '%(asctime)s %(name)-15s %(levelname)-8s %(process)-10d %(funcName)-30s %(message)s'
11 | handlers:
12 | console:
13 | class: logging.StreamHandler
14 | level: DEBUG
15 | formatter: detailed
16 | stream: ext://sys.stdout
17 | console_colored:
18 | class: utils.logging_util.ColoredLogger
19 | name: 'Colored'
20 | file:
21 | class: logging.FileHandler
22 | level: DEBUG
23 | encoding: 'utf-8'
24 | formatter: detailed
25 | filename: ../../logs/log-data-pipeline_{timestamp_placeholder}.log
26 | mode: a
27 | queue:
28 | class: utils.logging_util.QueueListenerHandler
29 | level: DEBUG
30 | handlers:
31 | - cfg://handlers.console
32 | - cfg://handlers.file
33 | queue: cfg://objects.queue
34 | loggers:
35 | simpleExample:
36 | level: INFO
37 | handlers: [console, file, queue]
38 | propagate: no
39 | root:
40 | level: DEBUG
41 | handlers: [console, file]
--------------------------------------------------------------------------------
/resources/data/product.csv:
--------------------------------------------------------------------------------
1 | id,name ,price
2 | 1 ,Wrist Watch,10
3 | 2 ,Shoes ,8
4 | 3 ,Tshirt ,5
5 | 4 ,Jeans ,7
6 | 5 ,Sunglasses ,7
7 |
--------------------------------------------------------------------------------
/resources/data/purchase.csv:
--------------------------------------------------------------------------------
1 | id ,productid,purchasedate,storeid
2 | 100,1 ,10/11/2019 ,1000
3 | 101,3 ,10/12/2019 ,1002
4 | 102,2 , ,1004
5 | 103,1 ,10/14/2019 ,1004
6 | 104,4 ,10/15/2019 ,1003
7 | 105,4 ,10/16/2019 ,1002
8 |
--------------------------------------------------------------------------------
/resources/data/recipes/output/task1/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/resources/data/recipes/output/task1/.part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task1/.part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet.crc
--------------------------------------------------------------------------------
/resources/data/recipes/output/task1/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task1/_SUCCESS
--------------------------------------------------------------------------------
/resources/data/recipes/output/task1/part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task1/part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet
--------------------------------------------------------------------------------
/resources/data/recipes/output/task2/._SUCCESS.crc:
--------------------------------------------------------------------------------
1 | crc
--------------------------------------------------------------------------------
/resources/data/recipes/output/task2/.part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv.crc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task2/.part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv.crc
--------------------------------------------------------------------------------
/resources/data/recipes/output/task2/_SUCCESS:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task2/_SUCCESS
--------------------------------------------------------------------------------
/resources/data/recipes/output/task2/part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv:
--------------------------------------------------------------------------------
1 | difficulty,avg_total_cooking_time
2 | easy,PT7M5.086705S
3 | hard,PT2H43M37.105263S
4 | medium,PT41M53.288136S
5 |
--------------------------------------------------------------------------------
/resources/data/store.csv:
--------------------------------------------------------------------------------
1 | id ,name
2 | 1000,Borivili
3 | 1001,Kandivili
4 | 1002,Andheri
5 | 1003,Bandra
6 | 1004,Dadar
7 | 1005,Byculla
8 |
--------------------------------------------------------------------------------
/sbin/common_functions.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #********************************************************#
3 | # Common bash reusable functions
4 | # library_functions.sh
5 | # April 2023
6 | #********************************************************#
7 |
8 | #*********************************************************
9 | # Comprehensive Logging
10 | #*********************************************************
11 |
12 | #--------------------------------------------------------------------
13 | # Prints a log statement
14 | # Parameter: (message) (level: DEBUG,INFO,ERROR,AUDIT)
15 | # Returns: N/A
16 | #--------------------------------------------------------------------
17 | log(){
18 | local msg=$1
19 | local lvl=$2
20 | if [ -z "${lvl}" ]; then
21 | lvl="INFO"
22 | fi
23 |
24 | ## 0=default; 31=red; 33=yellow, 93=light yellow; 34=blue
25 | # shellcheck disable=SC2155
26 | # shellcheck disable=SC2034
27 | local lts=$(date +%FT%T.%3N)
28 | case "${lvl}" in
29 | ("ERROR")
30 | >&2 echo -e "\e[31m${lvl}" $$ "-" "${msg}\e[0m"
31 | ;;
32 | ("WARN")
33 | echo -e "\e[93m${lvl}" $$ "-" "${msg}\e[0m"
34 | ;;
35 | ("AUDIT")
36 | echo -e "\e[34m${lvl}" $$ "-" "${msg}\e[0m"
37 | isCheckRequired=true
38 | ;;
39 | ("DEBUG")
40 | echo -e "\e[33m${lvl}" $$ "-" "${msg}\e[0m"
41 | # shellcheck disable=SC2034
42 | isCheckRequired=true
43 | ;;
44 | (*) echo -e "\e[0m"$$ "-" "${msg}"
45 | return 1
46 | ;;
47 | esac
48 | }
49 |
50 | #--------------------------------------------------------------------
51 | # Prints an error
52 | # Parameter: Error message
53 | # Returns: N/A
54 | #--------------------------------------------------------------------
55 | logError(){
56 | log "$1" "ERROR"
57 | }
58 |
59 | #--------------------------------------------------------------------
60 | # Prints a warn message
61 | # Parameter: Error message
62 | # Returns: N/A
63 | #--------------------------------------------------------------------
64 | logWarn(){
65 | log "$1" "WARN"
66 | }
67 |
68 | #--------------------------------------------------------------------
69 | # Prints an audit message
70 | # Parameter: Error message
71 | # Returns: N/A
72 | #--------------------------------------------------------------------
73 | logAudit(){
74 | log "$1" "AUDIT"
75 | }
76 |
77 | #--------------------------------------------------------------------
78 | # Prints a debug message
79 | # Parameter: Error message
80 | # Returns: N/A
81 | #--------------------------------------------------------------------
82 | logDebug(){
83 | log "$1" "DEBUG"
84 | }
85 |
86 | #--------------------------------------------------------------------
87 | # Performs cleanup task
88 | # Parameter: N/A
89 | # Returns: N/A
90 | #---------------------------------------------------------------------
91 | cleanup(){
92 | log "Process finished successfully, logs can be found at ${LOG_FILE}"
93 | }
94 |
95 | #--------------------------------------------------------------------
96 | # Called using trap on SIGINT, SIGQUIT, SIGABRT, SIGALRM, SIGTERM
97 | # Parameter: Error message
98 | # Returns: N/A
99 | #---------------------------------------------------------------------
100 | interrupt(){
101 | logError "Process got interrupted with exit code $?! Check error logs in ${LOG_FILE}"
102 | exit 1
103 | }
104 |
105 | #--------------------------------------------------------------------
106 | # Displays a loading indicator for background jobs
107 | # Parameter: Subprocess pid
108 | # Returns: N/A
109 | #---------------------------------------------------------------------
110 | loadingIndicator(){
111 | local pid=$1
112 | spin='-\|/'
113 |
114 | local i=0
115 | while kill -0 $pid 2>/dev/null
116 | do
117 | i=$(( (i+1) %4 ))
118 | # shellcheck disable=SC2059
119 | printf "\r${spin:$i:1}"
120 | sleep .1
121 | done
122 | }
123 |
--------------------------------------------------------------------------------
/sbin/create_python_venv.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | #********************************************************#
3 | # Python Virtual Environment setup
4 | # create_python_venv.sh
5 | # April 2023
6 | #********************************************************#
7 |
8 | #--------------------------------------------------------------------
9 | # Prints usage of script, non-zero exit in case of incorrect usage
10 | # Parameter: N/A
11 | # Returns: N/A
12 | #--------------------------------------------------------------------
13 | scriptUsage() {
14 | logError "Usage: ${SCRIPT_NAME} [./create_python_venv.sh -n VIRTUAL_ENV_NAME]"
15 | logError "Do not use 'sh' shell to run the script; use 'bash' or ./create_python_venv.sh " 1>&2
16 | exit 1;
17 | }
18 |
19 | #-----------------------------------------------------------------------
20 | # Checks if python3 exists otherwise exit with non-zero
21 | # Parameter: N/A
22 | # Returns: 0 if python3 exist else exit with non-zero
23 | #-----------------------------------------------------------------------
24 | python3Exists() {
25 | logWarn "Checking if python3 is installed on machine"
26 | if ! which python3; then
27 | logError "python3 is not installed in the machine, please install python3 as base to create virtual environments on top of base python"
28 | exit 1;
29 | fi
30 | log "python3 already installed on machine"
31 | return 0;
32 | }
33 |
34 | #-----------------------------------------------------------------------
35 | # Checks if pip tool exists otherwise downloads from internet to install
36 | # Exit with non-zero in case of poor or no internet connection
37 | # Parameter: N/A
38 | # Returns: N/A
39 | #-----------------------------------------------------------------------
40 | pipExists() {
41 | logWarn "Checking if pip tool is installed on machine or attempt to download from internet & install"
42 | if ! pip --version; then
43 | if ! curl https://bootstrap.pypa.io/get-pip.py --output get-pip.py; then
44 | logError "Error downloading file from the internet; check your internet connection & proxy settings"
45 | exit 1;
46 | else
47 | log "Downloaded get-pip.py successfully"
48 | if ! python get-pip.py; then
49 | logError "Error installing pip, check logs"
50 | exit 1;
51 | fi
52 | log "pip installed successfully, upgrading"
53 | python3 -m pip install --upgrade pip
54 | return 0
55 | fi
56 | else
57 | log "pip tool already available on machine, upgrading"
58 | python3.9 -m pip install --upgrade pip
59 | return 0
60 | fi
61 | }
62 |
63 | #--------------------------------------------------------------------
64 | # Installs virtualenvwrapper
65 | # Exit with non-zero in case of any error during installation,
66 | # Parameter: N/A
67 | # Returns: 0 if installation is successful, non-zero exit otherwise
68 | #--------------------------------------------------------------------
69 | installVEnvWrapper() {
70 | mkdir -p "$HOME/python_venvs/"
71 | if ! pip install virtualenvwrapper; then
72 | logError "Error installing virtualenvwrapper using pip; check logs & check your internet connection & proxy"
73 | exit 1;
74 | fi
75 | return 0
76 | }
77 |
78 | #--------------------------------------------------------------------
79 | # Creates python virtual environment by given name
80 | # Exit with non-zero in case of any error during creation,
81 | # Parameter: virtualEnvName: name of the virtual environment to create
82 | # Returns: N/A
83 | #--------------------------------------------------------------------
84 | createVirtualEnv() {
85 | local virtualEnvName
86 | virtualEnvName=$(echo "$1" | xargs) #xargs is to trim
87 | local python3FullPath
88 | python3FullPath=$(which python3)
89 | export VIRTUALENVWRAPPER_PYTHON="${python3FullPath}"
90 | export WORKON_HOME="$HOME/python_venvs/"
91 | export PROJECT_HOME="${HOME_DIRECTORY}/../"
92 | logDebug "Using below environment variables & path for virtual environment ${virtualEnvName}"
93 | logDebug "VIRTUALENVWRAPPER_PYTHON=${VIRTUALENVWRAPPER_PYTHON}"
94 | logDebug "WORKON_HOME=${WORKON_HOME}"
95 |
96 | source virtualenvwrapper.sh
97 |
98 | rmvirtualenv "${virtualEnvName}"
99 |
100 | if ! mkvirtualenv -a "${HOME_DIRECTORY}/../" -p "${python3FullPath}" "${virtualEnvName}";then
101 | logError "Error creating virtual environment ${virtualEnvName}"
102 | exit 1;
103 | fi
104 | }
105 |
106 | #------------------------------------------------------------------------------
107 | # installs required packages for virtual environment given in requirements.txt
108 | # Exit with non-zero in case of any error during installation,
109 | # Parameter: virtualEnvName: name of the virtual environment to create
110 | # Returns: N/A
111 | #------------------------------------------------------------------------------
112 | installRequiredPackages() {
113 | local virtualEnvName
114 | virtualEnvName=$(echo "$1" | xargs) #xargs is to trim
115 | workon "${virtualEnvName}"
116 | cdproject
117 | pip install -r requirements.txt
118 | # pip freeze > requirements.txt
119 | python3 -m pip install --upgrade pip
120 | # source activate
121 | }
122 |
123 | #************************************************************************
124 | #
125 | # MAIN SCRIPTS STARTS HERE
126 | #
127 | #************************************************************************
128 |
129 | # Execute ./create_python_venv.sh -n hello-fresh-data-engg
130 |
131 | # Read initial variables
132 | HOST_NAME=`hostname`
133 | USER=`whoami`
134 | SCRIPT_NAME=$(basename "$0")
135 | HOME_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
136 | cd "${HOME_DIRECTORY}" || exit # exit in case cd fails; very rare
137 | source "${HOME_DIRECTORY}/common_functions.sh"
138 |
139 | export SETUPTOOLS_USE_DISTUTILS=stdlib
140 |
141 | # trap interrupts 0 SIGHUP SIGINT SIGQUIT SIGABRT SIGALRM SIGTERM
142 | trap interrupt 1 2 3 6 14 15
143 | trap cleanup 0
144 |
145 | while getopts ":n:" arg; do
146 | case "${arg}" in
147 | n)
148 | VENV_NAME=${OPTARG}
149 | ;;
150 | *)
151 | scriptUsage
152 | ;;
153 | esac
154 | done
155 | shift $((OPTIND-1))
156 |
157 | if [[ -z ${VENV_NAME} ]]; then
158 | logError "Empty virtual environment name"
159 | scriptUsage
160 | fi
161 |
162 | mkdir -p "${HOME_DIRECTORY}/../logs/bash/"
163 | LOG_FILE="${HOME_DIRECTORY}/../logs/bash/log-python-venv-setup-${VENV_NAME}-$(date +%F-%H.%M.%S).log"
164 | # Global log redirect
165 | exec &> >(while read -r line; do printf '%s %s\n' "$(date -Iseconds): EXECUTION-LOG - $line"; done | tee -a "${LOG_FILE}" )
166 |
167 | log "Executing $SCRIPT_NAME on $HOST_NAME with arguments"
168 |
169 | if python3Exists && pipExists; then
170 | installVEnvWrapper
171 | createVirtualEnv "${VENV_NAME}"
172 | installRequiredPackages "${VENV_NAME}"
173 | fi
174 |
175 | exit 0;
176 |
--------------------------------------------------------------------------------
/sbin/execute-tasks-spark-submit.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Read initial variables
4 | HOME_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
5 | cd "${HOME_DIRECTORY}" || exit # exit in case cd fails; very rare
6 | export PYTHONPATH=${PYTHONPATH}:"${HOME_DIRECTORY}/../src/"
7 | export VIRTUALENVWRAPPER_PYTHON="$(which python3)"
8 | export WORKON_HOME="$HOME/python_venvs/"
9 | export PROJECT_HOME="${HOME_DIRECTORY}../"
10 | source virtualenvwrapper.sh
11 | source "$HOME/python_venvs/hello-fresh-data-engg/bin/activate"
12 |
13 | while getopts ":e:c:m:" arg; do
14 | case "${arg}" in
15 | e)
16 | NUM_EXECS=${OPTARG}
17 | ;;
18 | c)
19 | EXEC_CORES=${OPTARG}
20 | ;;
21 | m)
22 | EXEC_MEM=${OPTARG}
23 | ;;
24 | *)
25 | scriptUsage
26 | ;;
27 | esac
28 | done
29 | shift $((OPTIND-1))
30 |
31 | if [[ -z ${NUM_EXECS} || -z ${EXEC_CORES} || -z ${EXEC_MEM} ]]; then
32 | NUM_EXECS="2"
33 | NUM_CORES="1"
34 | EXEC_MEM="1g"
35 | fi
36 |
37 | FILES="${HOME_DIRECTORY}/../conf/data-quality/rules/production_configs/recipe-task1-dq-rules.json,${HOME_DIRECTORY}/../conf/data-quality/rules/production_configs/recipe-task2-dq-rules.json,${HOME_DIRECTORY}/../conf/spark/log4j.properties"
38 |
39 | spark-submit \
40 | --master local[*] \
41 | --name "HelloFresh Data Engineering Recipe tasks" \
42 | --driver-memory 1g \
43 | --num-executors "${NUM_EXECS}" \
44 | --executor-cores "${NUM_CORES}" \
45 | --executor-memory "${EXEC_MEM}" \
46 | --conf spark.dynamicAllocation.enabled=false \
47 | --conf spark.yarn.maxAppAttempts=1 \
48 | --conf spark.serializer="org.apache.spark.serializer.KryoSerializer" \
49 | --conf spark.driver.extraJavaOptions="-Dlog4j.configuration=log4j.properties" \
50 | --files "${FILES}" \
51 | ../src/com/vitthalmirji/datapipelines/recipe_tasks.py --input-data-dir "${HOME_DIRECTORY}/../resources/data/input" --output-data-dir "${HOME_DIRECTORY}/../resources/data/output"
52 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | import sys
3 |
4 | from setuptools.command.install import install
5 | from setuptools import setup, find_packages
6 |
7 | with open('requirements.txt') as f:
8 | requirements = f.read().splitlines()
9 |
10 | tests_require = ['pytest', 'pytest-cov', 'coverage']
11 |
12 | with open("docs/ETL_README.md", "r") as f:
13 | long_description = f.read()
14 |
15 |
16 | class ShellInstall(install):
17 | def run(self):
18 | if not sys.platform.startswith("linux"):
19 | print('Your platform {} might not be supported'.format(sys.platform))
20 | else:
21 | print('Running create_python_venv.sh -n hello-fresh-data-engg')
22 | subprocess.call(['./sbin/create_python_venv.sh', '-n', 'hello-fresh-data-engg'])
23 | install.run(self)
24 |
25 |
26 | setup(
27 | cmdclass={'install': ShellInstall},
28 | name='datapipelines-essentials',
29 | version='2.0',
30 | author='Vitthal Mirji',
31 | author_email='vitthalmirji@gmail.com',
32 | url='https://vitthalmirji.com',
33 | description='Datalake complex transformations simplified in PySpark',
34 | long_description='Simplified ETL process in Hadoop using Apache Spark. '
35 | 'SparkSession extensions, DataFrame validation, Column extensions, SQL functions, and DataFrame '
36 | 'transformations',
37 | long_description_content_type="text/markdown",
38 | install_requires=requirements,
39 | tests_require=tests_require,
40 | extras_require={
41 | 'test': tests_require,
42 | 'all': requirements + tests_require,
43 | 'docs': ['sphinx'] + tests_require,
44 | 'lint': []
45 | },
46 | license="GNU :: GPLv3",
47 | include_package_data=True,
48 | packages=find_packages(where='src', include=['com*']),
49 | package_dir={"": "src"},
50 | setup_requires=['setuptools'],
51 | classifiers=[
52 | "Programming Language :: Python :: 3",
53 | "License :: GNU :: GPLv3",
54 | "Operating System :: Linux",
55 | ],
56 | dependency_links=[],
57 | python_requires='>=3.7,<=3.9.5',
58 | keywords=['apachespark', 'spark', 'pyspark', 'etl', 'hadoop', 'bigdata', 'apache-spark', 'python', 'python3',
59 | 'data', 'dataengineering', 'datapipelines']
60 | )
61 |
--------------------------------------------------------------------------------
/src/com/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/datapipelines/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/datapipelines/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/datapipelines/clinical_trial/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/datapipelines/clinical_trial/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/datawarehousing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/datawarehousing/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/datawarehousing/change_data_capture.py:
--------------------------------------------------------------------------------
1 | import hashlib
2 |
3 | from pyspark.sql import SparkSession, DataFrame, Window
4 | from pyspark.sql.functions import col, row_number
5 |
6 | from com.vitthalmirji.utils.Utilities import is_null_or_empty
7 |
8 |
9 | def append_audit_attributes_to_xml(file, file_contents, xml_closing_tag):
10 | hash_val = hashlib.md5(file_contents.encode('utf-8')).hexdigest()
11 | return str(file_contents).replace(f'{xml_closing_tag}>',
12 | f'{hash_val} '
14 | f'{str(file)} '
15 | f'{xml_closing_tag}>')
16 |
17 |
18 | def add_row_number_to_dataframe(dataframe: DataFrame, primary_keys, order_by_keys, eliminate_duplicate_records=False,
19 | drop_row_number_column=False):
20 | window = Window.partitionBy(
21 | *list(map(lambda c: col(c), primary_keys))).orderBy(
22 | *list(map(lambda c: col(c).desc(), order_by_keys)))
23 | row_num_col = row_number().over(window=window).alias('row_num')
24 |
25 | if eliminate_duplicate_records and drop_row_number_column:
26 | return dataframe.withColumn(colName='row_num', col=row_num_col).filter('row_num = 1').drop('row_num')
27 | elif eliminate_duplicate_records:
28 | return dataframe.withColumn(colName='row_num', col=row_num_col).filter('row_num = 1')
29 | else:
30 | return dataframe.withColumn(colName='row_num', col=row_num_col)
31 |
32 |
33 | def add_audit_columns(_df: DataFrame) -> DataFrame:
34 | import datetime
35 | ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
36 | df: DataFrame = _df
37 | sel_cols = list(map(lambda x: str(f'`{x}`'), df.schema.names))
38 | sel_cols.append(f"reverse(split(input_file_name(), '/'))[0] AS spark_file_name")
39 | sel_cols.append(f"CAST('{ts}' AS TIMESTAMP) AS spark_timestamp")
40 | print(sel_cols)
41 | df: DataFrame = df.selectExpr(sel_cols)
42 | return df
43 |
44 |
45 | def identify_new_records(spark: SparkSession, old_dataframe: DataFrame, new_dataframe: DataFrame,
46 | primary_keys=[], order_by_keys=['current_timestamp']) -> DataFrame:
47 | old_df = "old_df"
48 | new_df = "new_df"
49 |
50 | if is_null_or_empty(primary_keys):
51 | print("WARNING - Empty primary keys given: Assuming all fields in the table for Deduplication")
52 | dedup_query = f"SELECT *FROM (SELECT t1.*, row_number() over (order by {','.join(order_by_keys)} desc) as row_num FROM {old_df} t1) WHERE row_num = 1"
53 | elif is_null_or_empty(old_dataframe) and is_null_or_empty(
54 | new_dataframe) and new_dataframe.count() <= 0 and old_dataframe.count() <= 0:
55 | print("Empty Dataframes")
56 | return None
57 | elif not is_null_or_empty(new_dataframe) and new_dataframe.count() > 0 and (
58 | is_null_or_empty(old_dataframe) or old_dataframe.count() <= 0):
59 | print("Assuming initial load CDC not required")
60 | return new_dataframe
61 | else:
62 | print(f"Before CDC Staging count = {old_dataframe.count()}")
63 | dedup_query = f"SELECT *FROM (SELECT t1.*, row_number() over (partition by {','.join(primary_keys)} order by {','.join(order_by_keys)} desc) as row_num FROM {old_df} t1) WHERE row_num = 1"
64 | old_dataframe.createOrReplaceTempView(old_df)
65 | new_dataframe.createOrReplaceTempView(new_df)
66 | spark.sql(dedup_query).createOrReplaceTempView(old_df)
67 |
68 | join_condition = list(map(lambda x: str(f'{old_df}.{x} = {new_df}.{x}'), primary_keys))
69 | exclude_condition = list(map(lambda x: str(f'{old_df}.{x} IS NULL'), primary_keys))
70 | new_pks_query = f"SELECT {new_df}.* FROM {new_df} LEFT JOIN {old_df} ON {' AND '.join(join_condition)} WHERE {' AND '.join(exclude_condition)}"
71 | updates_query = f"SELECT {new_df}.* FROM {new_df} INNER JOIN {old_df} ON {' AND '.join(join_condition)} WHERE {new_df}.hashcode <> {old_df}.hashcode"
72 | print(f"Fetch only New PK records query = {new_pks_query}")
73 | print(f"Fetch updated records query = {updates_query}")
74 | new_pk_records_df: DataFrame = spark.sql(new_pks_query).dropDuplicates()
75 | updates_df: DataFrame = spark.sql(updates_query).dropDuplicates()
76 |
77 | return new_pk_records_df.union(updates_df)
78 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/etl/CColumn.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.types import *
2 |
3 | from com.vitthalmirji.etl import ETL
4 |
5 |
6 | class CColumn:
7 | def __init__(self, colname, coldatatype, pk, filterclause, udf="", udfargs=[], casttype="", aliasname=""):
8 | self.colname = colname
9 | self.coldatatype = coldatatype
10 | self.pk = pk
11 | self.udf = udf
12 | self.udfargs = udfargs
13 | self.aliasname = aliasname
14 | self.casttype = casttype
15 | self.filterclause = filterclause
16 | self.selectexpression = ""
17 | self.matchmetatype = {
18 | 'tinyint': IntegerType(),
19 | 'smallint': IntegerType(),
20 | 'int': IntegerType(),
21 | 'bigint': LongType(),
22 | 'long': LongType(),
23 | 'float': FloatType(),
24 | 'double': DoubleType(),
25 | 'boolean': BooleanType(),
26 | 'string': StringType(),
27 | 'date': DateType(),
28 | 'timestamp': TimestampType(),
29 | 'binary': BinaryType()
30 | }
31 |
32 | def applyUdf(self):
33 | if ETL.isNullOrEmpty(self.udf) is None and len(self.udfargs) is 0:
34 | # tempcol: pyspark.sql.column.Column = col(str(self.colname))
35 | # tempcol.cast(self.matchmetatype[self.casttype]).alias(self.aliasname)
36 | self.selectexpression = f"CAST({self.colname} AS {self.casttype}) AS {self.aliasname},"
37 | elif ETL.isNullOrEmpty(self.udf) is not None and len(self.udfargs) is 0:
38 | # tempcol = col(self.colname)
39 | # kwargs = {'field': tempcol}
40 | # udfFunc = getattr(ETL, f"udf{str(self.udf).title()}")
41 | # tempcol = udfFunc(tempcol)
42 | # tempcol = tempcol.cast(self.matchmetatype[self.casttype]).alias(self.aliasname)
43 | self.selectexpression = f"CAST({self.udf}({self.colname}) AS {self.casttype}) AS {self.aliasname},"
44 | elif ETL.isNullOrEmpty(self.udf) is not None and len(self.udfargs) is not 0:
45 | # tempcol = col(self.colname)
46 | # udfFunc = getattr(ETL, f"udf{str(self.udf).title()}")
47 | # tempcol = udfFunc(tempcol)
48 | # tempcol = tempcol.cast(self.matchmetatype[self.casttype]).alias(self.aliasname)
49 | self.selectexpression = f"CAST({self.udf}({self.colname}, {','.join(self.udfargs)}) AS {self.casttype}) AS {self.aliasname}"
50 | else:
51 | self.selectexpression = f"CAST({self.colname} AS {self.casttype}) AS {self.aliasname},"
52 | return self.selectexpression
53 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/etl/ETL.py:
--------------------------------------------------------------------------------
1 | import datetime
2 |
3 | import pyspark.sql.functions as f
4 | import pytz
5 | from pyspark.sql import SparkSession
6 | from pyspark.sql.types import *
7 |
8 | lookup = {}
9 |
10 |
11 | # ToDo - Yet to add many potential UDFs
12 |
13 | def registerAllUDF(sc: SparkSession):
14 | sc.udf.register(name='datetimetogmt', f=datetimeToGMT)
15 | sc.udf.register(name='zonedatetimetogmt', f=zoneDatetimeToGMTZone)
16 | sc.udf.register(name='isnullorempty', f=isNullOrEmpty)
17 | sc.udf.register(name='datetimetogmt', f=datetimeToGMT)
18 | sc.udf.register(name='udfnvl', f=udfNvl)
19 | sc.udf.register(name='udflookup', f=udfLookups)
20 |
21 |
22 | def datetimeToGMT(dt, fmt):
23 | local = pytz.timezone("America/Los_Angeles")
24 | # format = "%Y-%m-%d %H:%M:%S"
25 | naive = datetime.datetime.strptime(str(dt).strip(), str(fmt).strip())
26 | local_dt = local.localize(naive, is_dst=None)
27 | utc_dt = local_dt.astimezone(pytz.utc)
28 | return utc_dt
29 |
30 |
31 | def strSplitSep(s, sep=','):
32 | return str(s).split(str(sep))
33 |
34 |
35 | def varargsToList(*fields, sep):
36 | return str(sep).join(fields)
37 |
38 |
39 | def zoneDatetimeToGMTZone(dt, fmt, zone):
40 | local = pytz.timezone(str(zone).strip())
41 | # format = "%Y-%m-%d %H:%M:%S"
42 | naive = datetime.datetime.strptime(str(dt).strip(), str(fmt).strip())
43 | local_dt = local.localize(naive, is_dst=None)
44 | utc_dt = local_dt.astimezone(pytz.utc)
45 | return utc_dt
46 |
47 |
48 | @f.udf(returnType=StringType())
49 | def udfNvl(field):
50 | if isNullOrEmpty(field) is None:
51 | return "-"
52 | else:
53 | return field
54 |
55 |
56 | @f.udf(returnType=StringType())
57 | def udfLookups(clname, s):
58 | finallookupvalue = []
59 | if s is None:
60 | return ""
61 | else:
62 | codes = str(s).split(sep=';')
63 | for cd in codes:
64 | if f"{clname} {cd}" in lookup.keys():
65 | finallookupvalue.append(lookup[f"{clname} {cd}"])
66 | else:
67 | finallookupvalue.append(cd)
68 |
69 | return ';'.join(finallookupvalue)
70 |
71 |
72 | def squared_udf(s):
73 | if s is None:
74 | return None
75 | return s * s
76 |
77 |
78 | def nullString(s):
79 | return s is None or str(s).strip().__eq__("") is None
80 |
81 |
82 | def isNullOrEmpty(s):
83 | if s is None:
84 | return None
85 | if str(s).strip() is None or str(s).strip().__eq__(""):
86 | return None
87 | return str(s).strip()
88 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/etl/ETLTransform.py:
--------------------------------------------------------------------------------
1 | import functools
2 |
3 | import pyspark.sql.functions as SparkSQLFunctions
4 | from pyspark.sql import DataFrame, SparkSession
5 |
6 | from com.vitthalmirji.etl import ETL
7 | from com.vitthalmirji.etl.ITable import SourceTable, TargetTable, matchEqualityOperator
8 | from com.vitthalmirji.etl.meta import MetaModel
9 | from com.vitthalmirji.etl.meta.MetaModel import MetaResult
10 |
11 |
12 | # ToDo - Source & Target group aggregations
13 |
14 | class Transform:
15 | def __init__(self, targettable, model: MetaModel, sc: SparkSession):
16 | self.model = model
17 | self.spark = sc
18 | self.sourcetables: list[SourceTable] = []
19 | self.targettable = targettable
20 | self.transformquery = ""
21 | self.joindict = {}
22 | self.sourcetablesdf: list[DataFrame] = []
23 | self.targetdf: DataFrame = None
24 | self.targetcolumnslist = []
25 | self.joincolumns = None
26 | self.jointype = None
27 |
28 | def genericDfOperation(self, operationFunc):
29 | return operationFunc(self)
30 |
31 | DataFrame.genericDfOperation = genericDfOperation
32 |
33 | def filterSourceTable(self, srctbl):
34 | srctbls = filter(lambda tbl: tbl.tablename == srctbl, self.sourcetables)
35 | return list(srctbls)
36 |
37 | def joinDataframes(self, dict1, dict2):
38 | targetdf: DataFrame = dict1['df'].join(dict2['df'], on=dict2['condition'], how=dict2['jointype'])
39 | return {'df': targetdf}
40 |
41 | def mapAggregationFunction(self, fieldname, functionname):
42 | if str(functionname).__eq__('min'):
43 | return SparkSQLFunctions.min(col=SparkSQLFunctions.col(fieldname))
44 | elif str(functionname).__eq__('max'):
45 | return SparkSQLFunctions.max(col=SparkSQLFunctions.col(fieldname))
46 | elif str(functionname).__eq__('count'):
47 | return SparkSQLFunctions.count(col=SparkSQLFunctions.col(fieldname))
48 | elif str(functionname).__eq__('sum'):
49 | return SparkSQLFunctions.sum(col=SparkSQLFunctions.col(fieldname))
50 | elif str(functionname).__eq__('avg'):
51 | return SparkSQLFunctions.avg(col=SparkSQLFunctions.col(fieldname))
52 |
53 | def applyJoin(self):
54 | self.query, self.joindict = self.model.joinSQL(self.model.datamodel, 'purchase', 'product', 'store')
55 |
56 | joinlist = []
57 | for k in self.joindict.keys():
58 | srctabledf: DataFrame = self.filterSourceTable(k)[0].targetdf
59 | self.joindict[k].update({'df': srctabledf})
60 | joinlist.append(self.joindict[k])
61 |
62 | self.targetdf: DataFrame = functools.reduce(self.joinDataframes, joinlist)['df']
63 |
64 | def applyFilters(self):
65 | tblinfo: MetaResult = self.model.filterMetaResultBySourceTable(self.sourcetables[0].tablename)
66 | targettable: TargetTable = TargetTable(sourcesystem=tblinfo.src_system, tablename=tblinfo.target_table, pk=[],
67 | database=tblinfo.target_database,
68 | filetype=tblinfo.target_filetype, filepath=tblinfo.target_file_path,
69 | modeltableorder=tblinfo.src_table_order)
70 |
71 | for metares in self.model.metaresultlist:
72 | filterexpr = matchEqualityOperator(expression=metares.src_col_filter)
73 | if filterexpr is not None and not filterexpr.__eq__("") and not filterexpr.lower().__eq__('none'):
74 | self.filterclause = f"{self.filterclause} {metares.target_col}{filterexpr}".strip()
75 |
76 | self.filterclause = self.filterclause.strip()
77 |
78 | if self.filterclause is None:
79 | self.filterclause = ""
80 |
81 | targettable.df: DataFrame = self.targetdf.filter(self.filterclause)
82 |
83 | def applyGroupAndAggregation(self):
84 | selectlist = []
85 | aggregations = {}
86 | for metares in self.model.filterMetaResultByTargetTable(self.targettable):
87 | if ETL.isNullOrEmpty(metares.target_col_aggregator) is not None:
88 | selectlist.append(metares.target_col)
89 | else:
90 | aggregations.update({
91 | metares.target_col: {
92 | 'function': metares.target_col_aggregator,
93 | 'filter': metares.target_col_aggregator_filter
94 | }
95 | })
96 |
97 | self.targetdf: DataFrame = self.targetdf.groupby(*selectlist).agg(SparkSQLFunctions.min)
98 |
99 | def transform(self):
100 | # Get Unique source table names for Transformation
101 | srctables = set()
102 | for metares in self.model.metaresultlist:
103 | srctables.add(metares.src_table)
104 |
105 | # For each source table create SourceTable object and assign transform columns
106 | for srctable in srctables:
107 | tablemetaresult = self.model.filterMetaResultBySourceTable(srctbl=srctable)
108 | tblinfo: MetaResult = tablemetaresult[0]
109 |
110 | fklist = []
111 |
112 | for item in self.model.datamodel.keys():
113 | if self.model.datamodel[item]['fk'] is not None or self.model.datamodel[item]['fk'] is {}:
114 | if srctable in self.model.datamodel[item]['fk'].keys():
115 | fklist.extend(self.model.datamodel[item]['fk'][srctable]['fk_pk'])
116 |
117 | sourcetable: SourceTable = SourceTable(sourcesystem=tblinfo.src_system, tablename=tblinfo.src_table,
118 | pk=self.model.datamodel[tblinfo.src_table]['pk'],
119 | fk=fklist,
120 | database=tblinfo.src_database, filepath=tblinfo.src_file_path,
121 | filetype=tblinfo.src_filetype,
122 | modeltableorder=tblinfo.src_table_order)
123 | self.sourcetables.append(sourcetable)
124 | for tbl in tablemetaresult:
125 | sourcetable.addColumn(name=tbl.src_col, type=tbl.src_col_datatype,
126 | pk=(True, False)[tbl.src_key_constraints.__eq__('pk')],
127 | udf=tbl.udf, udfargs=tbl.udfarguments, casttype=tbl.target_col_datatype,
128 | aliasname=tbl.target_col, filterclause=tbl.src_col_filter, fk={})
129 |
130 | # Read file as dataframe
131 | sourcetable.readFileFromSource(spark=self.spark)
132 |
133 | ETL.registerAllUDF(sc=self.spark)
134 | for sourcetable in self.sourcetables:
135 | sourcetable.applyTransform()
136 |
137 | self.applyJoin()
138 |
139 | self.applyFilters()
140 |
141 | self.applyGroupAggregation()
142 |
143 | self.targetdf.show()
144 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/etl/ITable.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from abc import ABC
3 |
4 | from pyspark.sql import DataFrame, SparkSession
5 |
6 | from com.vitthalmirji.etl import ETL
7 | from com.vitthalmirji.etl.CColumn import CColumn
8 | from com.vitthalmirji.imports.HdfsImport import HdfsImport
9 |
10 |
11 | class ITable:
12 | sourcesystem: str
13 | tablename: str
14 | columnlist: []
15 | pk: []
16 | fk: []
17 | database: str
18 | filepath: str
19 | modeltableorder: int
20 |
21 | @abc.abstractmethod
22 | def getColumnList(self): []
23 |
24 | @abc.abstractmethod
25 | def getPkList(self): []
26 |
27 | @abc.abstractmethod
28 | def getFkList(self): []
29 |
30 | @abc.abstractmethod
31 | def getPath(self): str
32 |
33 | @abc.abstractmethod
34 | def getDatabaseName(self): str
35 |
36 | @abc.abstractmethod
37 | def readFileFromSource(self, park: SparkSession, opt={}, tbl=""): DataFrame
38 |
39 |
40 | def matchEqualityOperator(expression):
41 | expr = str(expression)
42 | if expr is None or expr.__eq__("None"):
43 | expr = str("")
44 | elif expr.find('eq(') != -1:
45 | expr = expr.replace('eq(', '=').replace(')', '')
46 | if expr.find('gt') != -1:
47 | expr = expr.replace('gt(', '>').replace(')', '')
48 | elif expr.find('lt') != -1:
49 | expr = expr.replace('lt(', '<').replace(')', '')
50 | elif expr.find('lte') != -1:
51 | expr = expr.replace('lte(', '<=').replace(')', '')
52 | elif expr.find('gte') != -1:
53 | expr = expr.replace('gte(', '>=').replace(')', '')
54 | elif expr.find('notin') != -1:
55 | expr = expr.replace('notin', 'NOT IN')
56 | elif expr.find('in') != -1:
57 | expr = expr.replace('in', 'IN')
58 | elif expr.find('ne') != -1:
59 | expr = expr.replace('ne(', '<>').replace(')', '')
60 | else:
61 | expr = expr.strip()
62 |
63 | if expr is None or expr.__eq__('None'):
64 | expr = ""
65 |
66 | return expr
67 |
68 |
69 | class SourceTable(ITable):
70 | def __init__(self, sourcesystem, tablename, pk, fk, database, filetype, filepath, modeltableorder):
71 | self.tablename = tablename
72 | self.pk = pk
73 | self.fk = fk
74 | self.database = database
75 | self.sourcesystem = sourcesystem
76 | self.filepath = filepath
77 | self.filetype = filetype
78 | self.modeltableorder = modeltableorder
79 | self.df: DataFrame = None
80 | self.columnlist: list[CColumn] = []
81 | self.filterclause = ""
82 |
83 | def getFilterCondition(self):
84 | return self.filterclause
85 |
86 | def addColumn(self, name, type, pk, udf, udfargs, casttype, aliasname, filterclause, fk={}) -> None:
87 | col = CColumn(colname=name, coldatatype=type, pk=pk, udf=udf, udfargs=udfargs, casttype=casttype,
88 | aliasname=aliasname, filterclause=filterclause)
89 |
90 | filterexpr = matchEqualityOperator(expression=filterclause)
91 | if filterexpr is not None and not filterexpr.__eq__("") and not filterexpr.lower().__eq__('none'):
92 | self.filterclause = f"{self.filterclause} {name}{filterexpr}".strip()
93 |
94 | self.filterclause = self.filterclause.strip()
95 |
96 | if self.filterclause is None:
97 | self.filterclause = ""
98 |
99 | self.columnlist.append(col)
100 |
101 | def getPkList(self) -> []:
102 | return self.pk
103 |
104 | def getFkList(self) -> []:
105 | return self.fk
106 |
107 | def getColumnList(self) -> []:
108 | return self.columnlist
109 |
110 | def getDatabaseName(self) -> str:
111 | return self.database
112 |
113 | def getPath(self) -> str:
114 | return self.filepath
115 |
116 | def readFileFromSource(self, spark: SparkSession, opt={}, tbl="") -> DataFrame:
117 | importModule = HdfsImport(spark=spark)
118 | sourcedf = importModule.readFromSource(location=self.filepath, filetype=self.filetype, opt=opt)
119 | self.df: DataFrame = sourcedf
120 | return sourcedf
121 |
122 | def getDf(self) -> DataFrame:
123 | return self.df
124 |
125 | def applyTransform(self):
126 | selectexpression = ""
127 | for _srccol in self.columnlist:
128 | srccol: CColumn = _srccol
129 | selectexpression = f"{selectexpression}{srccol.applyUdf()}"
130 |
131 | selectexpression = f"{selectexpression}--End"
132 | selectexpression = selectexpression.strip(',--End')
133 |
134 | for p in self.pk:
135 | selectexpression = f"{selectexpression}, {p} AS {self.tablename}{p}"
136 |
137 | for f in self.fk:
138 | selectexpression = f"{selectexpression}, {f}"
139 |
140 | if ETL.isNullOrEmpty(self.filterclause) is not None:
141 | self.targetdf: DataFrame = self.df.filter(self.filterclause).selectExpr(selectexpression)
142 | else:
143 | self.targetdf: DataFrame = self.df.selectExpr(selectexpression)
144 |
145 | return self.targetdf
146 |
147 |
148 | class TargetTable(ITable, ABC):
149 | def __init__(self, sourcesystem, tablename, pk, database, filetype, filepath, modeltableorder):
150 | self.tablename = tablename
151 | self.pk = pk
152 | self.database = database
153 | self.sourcesystem = sourcesystem
154 | self.filepath = filepath
155 | self.filetype = filetype
156 | self.modeltableorder = modeltableorder
157 | self.df: DataFrame = None
158 | self.columnlist: list[CColumn] = []
159 | self.sourcetableslist = list[SourceTable] = []
160 | self.filterclause = ""
161 | self.aggregationcolumns = []
162 | self.aggregationfilter = []
163 |
164 | def getPkList(self) -> []:
165 | return self.pk
166 |
167 | def getFkList(self) -> []:
168 | return self.fk
169 |
170 | def getColumnList(self) -> []:
171 | return self.columnlist
172 |
173 | def getDatabaseName(self) -> str:
174 | return self.database
175 |
176 | def getPath(self) -> str:
177 | return self.filepath
178 |
179 | def addColumn(self, name, type, pk, filterclause) -> None:
180 | col = CColumn(colname=name, coldatatype=type, pk=pk, filterclause=filterclause)
181 |
182 | filterexpr = matchEqualityOperator(expression=filterclause)
183 | if filterexpr is not None and not filterexpr.__eq__("") and not filterexpr.lower().__eq__('none'):
184 | self.filterclause = f"{self.filterclause} {name}{filterexpr}".strip()
185 |
186 | self.filterclause = self.filterclause.strip()
187 |
188 | if self.filterclause is None:
189 | self.filterclause = ""
190 |
191 | self.columnlist.append(col)
192 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/etl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/etl/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/etl/meta/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/etl/meta/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/imports/HdfsImport.py:
--------------------------------------------------------------------------------
1 | import abc
2 | from abc import ABC
3 |
4 | from pyspark.sql import SparkSession
5 | from pyspark.sql.dataframe import DataFrame
6 |
7 | from com.vitthalmirji.etl import ETL
8 |
9 |
10 | class IImport:
11 | spark: SparkSession
12 | system: str
13 | table: str
14 |
15 | @abc.abstractmethod
16 | def readFromSource(self, location, filetype, opt={}, tbl=""): DataFrame
17 |
18 | @abc.abstractmethod
19 | def cleanup(self, location): None
20 |
21 |
22 | class HdfsImport(IImport, ABC):
23 | def __init__(self, spark: SparkSession):
24 | self.spark = spark
25 |
26 | def readFromSource(self, location, filetype, opt={}, tbl="") -> DataFrame:
27 | try:
28 | if str(filetype).lower().__eq__('tbl'):
29 | if ETL.isNullOrEmpty(tbl) is not None:
30 | try:
31 | _ = self.spark.read.table(tbl)
32 | except Exception as ex:
33 | print(f"Error reading table {tbl}")
34 | else:
35 | print(f"Invalid table {tbl} -Table do not exist in SQL Context: ")
36 | elif str(filetype).lower().__eq__('text'):
37 | return self.spark.read.text(paths=location, wholetext=True).toDF('line')
38 | elif str(filetype).lower().__eq__('csv'):
39 | return self.spark.read.options(header=True, inferSchema=True).csv(path=location)
40 | elif str(filetype).lower().__eq__('xml'):
41 | print(opt)
42 | return self.spark.read.format('com.databricks.spark.xml').options(rowTag='HotelDescriptiveContent',
43 | rootTag='HotelDescriptiveContents',
44 | valueTag='xmlvaluetag',
45 | attributePrefix="@").load(
46 | path=location)
47 | elif str(filetype).lower().__eq__('json'):
48 | return self.spark.read.options(options=opt).json(path=location)
49 | elif str(filetype).lower().__eq__('orc'):
50 | return self.spark.read.options(options=opt).orc(location)
51 | elif str(filetype).lower().__eq__('parquet'):
52 | return self.spark.read.options(options=opt).parquet(location)
53 | else:
54 | raise "Invalid filetype: " + filetype
55 | except Exception as ex:
56 | print("Error reading file in Spark of filetype " + filetype + " Error details: " + str(ex))
57 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/imports/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/imports/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/kafka/Logger.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 |
4 | from kafka import KafkaProducer
5 |
6 | logger_names = []
7 |
8 |
9 | class Logger(logging.Handler):
10 |
11 | def __init__(self, Jobname, hostlist, topic, tls=None):
12 | self.__level = "INFO"
13 | self.__formatter = "%(asctime)s %(levelname)-8s %(message)s"
14 | self.__local_file_path = Jobname + ".log"
15 | logging.Handler.__init__(self)
16 | self.producer = KafkaProducer(bootstrap_servers=hostlist,
17 | value_serializer=lambda v: json.dumps(v).encode('utf-8'),
18 | linger_ms=10)
19 | self.topic = topic
20 |
21 | def get(self, name):
22 | global logger_names
23 | logger = logging.getLogger(name)
24 | logger.setLevel(self.__level)
25 | if name not in logger_names:
26 | handler = logging.FileHandler(self.__local_file_path)
27 | formatter = logging.Formatter(self.__formatter)
28 | handler.setFormatter(formatter)
29 | handler.setLevel(self.__level)
30 | logger.addHandler(handler)
31 | logger_names.append(name)
32 | return logger
33 |
34 | # Write log to kafka topic
35 | def emit(self, record):
36 | # Avoid infinite loop by checking if Kafka's logs are looping in messages
37 | if 'kafka.' in record.name:
38 | return
39 | try:
40 | # apply the logger formatter
41 | msg = self.format(record)
42 | self.producer.send(self.topic, {'message': msg})
43 | self.flush(timeout=1.0)
44 | except Exception:
45 | logging.Handler.handleError(self, record)
46 |
47 | def flush(self, timeout=None):
48 | # Flush all the objects
49 | self.producer.flush(timeout=timeout)
50 |
51 | def close(self):
52 | # Close producer and clean up
53 | self.acquire()
54 | try:
55 | if self.producer:
56 | self.producer.close()
57 | logging.Handler.close(self)
58 | finally:
59 | self.release()
60 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/kafka/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/kafka/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/main.py:
--------------------------------------------------------------------------------
1 | # Main file
2 |
3 | # Create objects & invoke methods required for your ETL process
4 | import datetime
5 | import logging
6 |
7 | from utils.Utilities import init_logging
8 |
9 | if __name__ == '__main__':
10 | init_logging(log_time_stamp=datetime.datetime.now().isoformat().__str__())
11 | logging.debug("Hello")
12 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/mapper/Mapper.py:
--------------------------------------------------------------------------------
1 | import abc
2 |
3 | from pyspark.sql import SparkSession
4 | from pyspark.sql.dataframe import DataFrame
5 | from pyspark.sql.types import DataType, StructType, ArrayType, StructField, LongType
6 |
7 |
8 | class IMapper:
9 | @abc.abstractmethod
10 | def getDataframeSchema(self, df: DataFrame): DataFrame
11 |
12 | def createDDL(self, df: DataFrame, database, table, location): str
13 |
14 |
15 | def generate_deterministic_surrogate_key(spark: SparkSession, df: DataFrame, keyOffset=1, colName="keyName"):
16 | try:
17 | new_schema = StructType([StructField(colName, LongType(), True)] + df.schema.fields)
18 | new_rdd = df.rdd.zipWithIndex().map(lambda row: ([row[1] + keyOffset] + list(row[0])))
19 | max_key = new_rdd.map(lambda x: x[0]).max()
20 | final_df = spark.createDataFrame(new_rdd, new_schema)
21 | return final_df, max_key, "success", "errorNotFound"
22 | except Exception as e:
23 | return df, keyOffset, "error", e
24 |
25 |
26 | class ComplexDataMapper(IMapper):
27 | outerselects = []
28 |
29 | def __init__(self, sc):
30 | self.spark: SparkSession = sc
31 |
32 | def getDataframeSchema(self, df: DataFrame) -> StructType:
33 | return df.schema
34 |
35 | def createDDL(self, df: DataFrame, database, table, location):
36 | newline = '\n'
37 | ddl = str("")
38 | if database.__eq__(""):
39 | ddl = str(f"CREATE EXTERNAL TABLE {table} {newline}({newline}")
40 | else:
41 | ddl = str(f"CREATE EXTERNAL TABLE {database}.{table} {newline}({newline}")
42 |
43 | bigarraytypes: list[(str, str)] = None
44 |
45 | for field in df.schema.fields:
46 | if len(field.dataType.simpleString()) <= 100000:
47 | ddl = ddl + str(f"`{field.name}` {field.dataType.simpleString()},{newline}")
48 | else:
49 | print(f"Found big tag {field.name} skipping.. as the type definition exceeds more than value set in "
50 | f"Ambari > Hive > Configs > Advanced > Custom hive-site hive.metastore.max.typename.length=100000")
51 | # bigarraytypes += list[(field.name, field.dataType.sql)]
52 |
53 | ddl = ddl.rstrip(',\n')
54 |
55 | ddl += f"{newline}) {newline}" \
56 | f"STORED AS PARQUET {newline}" \
57 | f"LOCATION {location};{newline};"
58 |
59 | return ddl
60 |
61 | def createViews(self, df: DataFrame, root_table_name='xmltable',
62 | columns_cascade_to_leaf_level_with_alias=None) -> {}:
63 | views = {}
64 | views, xpaths = self.complexTypeIterator(viewname="", viewpath="", database="",
65 | table=root_table_name, level=0,
66 | dtype=df.schema, acc={}, root_table_name=root_table_name,
67 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level_with_alias)
68 | return views, xpaths
69 |
70 | def handleStructType(self, viewname, viewpath, database, table, level, dtype, columns_cascade_to_leaf_level, acc={},
71 | xpath=[]) -> {}:
72 | structtype: StructType = dtype
73 | selcols = []
74 | if columns_cascade_to_leaf_level is not None and len(columns_cascade_to_leaf_level) > 0:
75 | cascade_columns = f", {','.join(list(map(lambda c: f't{level}.{c}', columns_cascade_to_leaf_level)))}"
76 | else:
77 | cascade_columns = ""
78 | if viewname is None or str(viewname).__eq__(""):
79 | viewname = table
80 | for field in structtype.fields:
81 | if str(field.dataType).lower().startswith("struct"):
82 | selcols.append(f"t{level}.`{field.name}`")
83 | viewname = field.name
84 | viewpath = f"{table.replace('.', '_')}.{viewname.replace('.', '_')}"
85 | query = f"SELECT t{level}.`{field.name}`.*, t{level}.surrogate_id_{table}, " \
86 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \
87 | f"{cascade_columns} " \
88 | f"FROM {table} t{level} "
89 | keynm = f"{table.replace('.', '_')}_{viewname}"
90 | acc.update({keynm: query})
91 | self.handleStructType(viewname=viewname, viewpath=viewpath, database=database, table=keynm, level=level,
92 | dtype=field.dataType, acc=acc,
93 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level)
94 | elif str(field.dataType).lower().startswith("array"):
95 | selcols.append(f"t{level}.`{field.name}`")
96 | arrtype: ArrayType = field.dataType
97 | if str(arrtype.elementType).lower().startswith("struct"):
98 | viewname = field.name
99 | viewpath = f"{table.replace('.', '_')}.{viewname.replace('.', '_')}"
100 | query = f"SELECT v{level}.*, t{level}.surrogate_id_{table}, " \
101 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \
102 | f"{cascade_columns} " \
103 | f"FROM {table} t{level} LATERAL VIEW INLINE(t{level}.`{field.name}`) v{level}"
104 | keynm = f"{table.replace('.', '_')}_{viewname}"
105 | acc.update({keynm: query})
106 | self.handleStructType(viewname=viewname, viewpath=viewpath, database=database, table=keynm,
107 | level=level + 1, dtype=arrtype.elementType, acc=acc,
108 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level)
109 | else:
110 | viewname = field.name
111 | viewpath = f"{table.replace('.', '_')}.{viewname.replace('.', '_')}"
112 | query = f"SELECT v{level}.col AS {viewname}, " \
113 | f"t{level}.surrogate_id_{table}, " \
114 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \
115 | f"{cascade_columns} " \
116 | f"FROM {table} t{level} " \
117 | f"LATERAL VIEW EXPLODE(t{level}.`{field.name}`) v{level}"
118 | keynm = f"{table.replace('.', '_')}_{viewname}"
119 | acc.update({keynm: query})
120 | xpath.append(f'{viewpath.replace(".", "/")}/{field.name}')
121 | else:
122 | xpath.append(f'{viewpath.replace(".", "/")}/{field.name}')
123 | selcols.append(f"t{level}.`{field.name}`")
124 |
125 | if len(selcols) > 0:
126 | query = f"SELECT {','.join(selcols)}, " \
127 | f"monotonically_increasing_id() AS surrogate_id_{table} " \
128 | f"{cascade_columns} " \
129 | f"FROM {table} t{level}"
130 | keynm = f"{table.replace('.', '_')}_{viewname}_outer"
131 | # acc.update({keynm: query})
132 | return acc
133 |
134 | def handleArrayType(self, viewname, viewpath, database, table, level, dtype: ArrayType,
135 | columns_cascade_to_leaf_level, acc={}, xpath=[]) -> {}:
136 | if columns_cascade_to_leaf_level is not None and len(columns_cascade_to_leaf_level) > 0:
137 | cascade_columns = f", {','.join(list(map(lambda c: f't{level}.{c}', columns_cascade_to_leaf_level)))}"
138 | else:
139 | cascade_columns = ""
140 | if str(dtype.elementType).lower().startswith("struct"):
141 | arr_struct_type: StructType = dtype.elementType
142 | viewname = arr_struct_type.name
143 | viewpath = f"{table.replace('.', '_')}.{viewname.replace('.', '_')}"
144 | query = f"SELECT v{level}.*, t{level}.surrogate_id_{table}," \
145 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \
146 | f"{cascade_columns} " \
147 | f"FROM {table} t{level} " \
148 | f"LATERAL VIEW INLINE(t{level}.`{arr_struct_type.name}`) v{level}"
149 | keynm = f"{table.replace('.', '_')}_{viewname}"
150 | acc.update({keynm: query})
151 | self.handleStructType(viewname=viewname, viewpath=viewpath, database=database, table=keynm,
152 | level=level + 1, dtype=arr_struct_type, acc=acc,
153 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level)
154 | else:
155 | viewname = viewname
156 | viewpath = viewpath
157 | query = f"SELECT v{level}.col AS {viewname}, t{level}.surrogate_id_{table}, " \
158 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \
159 | f"{cascade_columns} " \
160 | f"FROM {table} t{level} " \
161 | f"LATERAL VIEW EXPLODE(t{level}.`{viewname}`) v{level}"
162 | keynm = f"{table.replace('.', '_')}_{viewname}"
163 | acc.update({keynm: query})
164 | xpath.append(f'{viewpath.replace(".", "/")}/{viewname}')
165 | return acc, xpath
166 |
167 | def complexTypeIterator(self, viewname, viewpath, database, table, level,
168 | dtype: DataType, root_table_name, columns_cascade_to_leaf_level, acc={}, xpath=[]) -> {}:
169 | if viewname is None or str(viewname).__eq__(""):
170 | keynm = f"{table.replace('.', '_')}"
171 | if columns_cascade_to_leaf_level is not None:
172 | cascade_columns = f", {','.join(list(map(lambda c: f't{level}.{c}', columns_cascade_to_leaf_level)))}"
173 | else:
174 | cascade_columns = ""
175 | query = f"SELECT t{level}.*, " \
176 | f"monotonically_increasing_id() AS surrogate_id_{table} " \
177 | f"{cascade_columns} " \
178 | f"FROM {root_table_name} t{level}"
179 | acc.update({keynm: query})
180 | table = keynm
181 |
182 | columns_cascade_to_leaf_level = list(
183 | map(lambda c: f"{c.split('AS')[-1].strip()} AS {c.split('AS')[-1].strip()}",
184 | columns_cascade_to_leaf_level))
185 |
186 | if dtype.typeName().lower().__eq__("struct"):
187 | self.handleStructType(viewname=viewname, viewpath=viewpath, database=database, table=table, level=level,
188 | dtype=dtype, acc=acc, xpath=[],
189 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level)
190 | elif dtype.typeName().lower().__eq__("array"):
191 | self.handleArrayType(viewname=viewname, viewpath=viewpath, database=database, table=table, level=level,
192 | dtype=dtype, acc=acc, xpath=[],
193 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level)
194 | else:
195 | xpath.append(f'{viewpath.replace(".", "/")}/{viewname}')
196 | return acc, xpath
197 | return acc, xpath
198 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/mapper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/mapper/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/objects/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/objects/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/objects/enums/Environments.py:
--------------------------------------------------------------------------------
1 | import abc
2 | import socket
3 |
4 | from pyspark.sql import SparkSession
5 |
6 |
7 | class Environment:
8 | def __init__(self, name, nameservice, zookeeperquorum, historyserver):
9 | self.name = name
10 | self.nameservice = nameservice
11 | self.zookeeperquorum = zookeeperquorum
12 | self.historyserver = historyserver
13 |
14 |
15 | class IEnvironment:
16 | @abc.abstractmethod
17 | def getEnvironment(self, sc: SparkSession): Environment
18 |
19 | def getEnvironmentByServer(self): Environment
20 |
21 |
22 | class Environments(IEnvironment):
23 | def __init__(self):
24 | self.local = Environment("local", "", "localhost", "localhost:18081")
25 | self.dev = Environment("dev", "", "localhost", "localhost:18081")
26 | self.intg = Environment("intg", "", "localhost", "localhost:18081")
27 | self.test = Environment("test", "", "localhost", "localhost:18081")
28 | self.prod = Environment("prod", "", "localhost", "localhost:18081")
29 |
30 | def getEnvironment(self, sc: SparkSession) -> Environment:
31 | hostname = socket.gethostname()
32 | if hostname.lower().startswith("v") or hostname.lower().startswith("u"):
33 | return self.local
34 | elif hostname.lower().startswith("intg"):
35 | return self.intg
36 | elif hostname.lower().startswith("test"):
37 | return self.test
38 | elif hostname.lower().startswith("prod"):
39 | return self.prod
40 |
41 | def getEnvironmentByServer(self) -> Environment:
42 | hostname = socket.gethostname()
43 | if hostname.lower().startswith("v") or hostname.lower().startswith("u"):
44 | return self.local
45 | elif hostname.lower().startswith("intg"):
46 | return self.intg
47 | elif hostname.lower().startswith("test"):
48 | return self.test
49 | elif hostname.lower().startswith("prod"):
50 | return self.prod
51 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/objects/enums/Zones.py:
--------------------------------------------------------------------------------
1 | # ToDo - More extentions (if any) to various zones
2 | class Zones:
3 | def __init__(self):
4 | self.stage = "stage"
5 | self.work = "work"
6 | self.publish = "publish"
7 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/objects/enums/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/objects/enums/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/utils/MockupData.py:
--------------------------------------------------------------------------------
1 | from random import Random
2 | from typing import Optional, Any
3 |
4 | from pyspark.sql.types import *
5 |
6 |
7 | # ToDo Yet to complete Random data generation
8 | class Maybe(object):
9 | def get_or_else(self, default):
10 | return self.value if isinstance(self, Just) else default
11 |
12 |
13 | class Just(Maybe):
14 | def __init__(self, value):
15 | self.value = value
16 |
17 |
18 | class Nothing(Maybe):
19 | pass
20 |
21 |
22 | # Random data generators for Spark SQL DataTypes. These generators do not generate uniformly random
23 | # values; instead, they're biased to return "interesting" values (such as maximum / minimum values)
24 | # with higher probability.
25 | class MockupData:
26 | # The conditional probability of a non-null value being drawn from a set of "interesting" values
27 | # instead of being chosen uniformly at random.
28 | PROBABILITY_OF_INTERESTING_VALUE: float = 0.5
29 |
30 | # The probability of the generated value being null
31 | PROBABILITY_OF_NULL: float = 0.1
32 |
33 | MAX_STR_LEN: int = 1024
34 | MAX_ARR_SIZE: int = 128
35 | MAX_MAP_SIZE: int = 128
36 |
37 | # Returns a randomly generated schema, based on the given accepted types.
38 | # @param numFields the number of fields in this schema
39 | # @param acceptedTypes types to draw from.
40 | def randomSchema(self, rand: Random, numFields: int, acceptedTypes: list[DataType]) -> StructType:
41 | structfields = []
42 | i = 0
43 | while i < numFields:
44 | dt = acceptedTypes[rand.randint(1, len(acceptedTypes))]
45 | structfields.append(StructField(f"col_{i}", dt, nullable=bool(rand.getrandbits(1))))
46 | return StructType(structfields)
47 |
48 | # Returns a function which generates random values for the given `DataType`, or `None` if no
49 | # random data generator is defined for that data type. The generated values will use an external
50 | # representation of the data type; for example, the random generator for `DateType` will return
51 | # instances of [[java.sql.Date]] and the generator for `StructType` will return a [[Row]].
52 | # For a `UserDefinedType` for a class X, an instance of class X is returned.
53 | # #@param dataType the type to generate values for
54 | # @param nullable whether null values should be generated
55 | # @param rand an optional random number generator
56 | # @return a function which can be called to generate random values.
57 | def forType(self, dataType: DataType, nullable: bool, rand: Random = Random()) -> Optional[Any]:
58 | return Optional[Any]()
59 |
60 | # Generates a random row for `schema`.
61 | def randomRow(self, rand: Random, schema: StructType) -> Row:
62 | fields = list(StructField)
63 | for f in schema.fields:
64 | if str(f.dataType).lower().__eq__("arraytype"):
65 | data = None
66 | if f.nullable and rand.random() <= self.PROBABILITY_OF_NULL:
67 | data = None
68 | else:
69 | arr = []
70 | n = 1
71 | i = 0
72 | _f: ArrayType = f.dataType()
73 | generator = self.forType(_f.elementType, f.nullable, rand)
74 | assert (generator.isDefined, "Unsupported")
75 | gen = generator.get
76 | while i < n:
77 | arr.append(gen)
78 | i = i + 1
79 | data = arr
80 | fields.append(data)
81 | elif str(f.dataType).lower().__eq__("structtype"):
82 | _f: StructType = f
83 | for c in _f:
84 | fields.append(self.randomRow(rand, StructType(c.dataType())))
85 | else:
86 | generator = self.forType(f.dataType, f.nullable, rand)
87 | assert (generator.isDefined, "Unsupported")
88 | gen = generator.get
89 | fields.append(gen)
90 | return Row(*fields)
91 |
92 | # Returns a random nested schema. This will randomly generate structs and arrays drawn from
93 | # acceptedTypes.
94 | def randomNestedSchema(self, rand: Random, totalFields: int, acceptedTypes: list[DataType]) -> StructType:
95 | fields = []
96 | i = 0
97 | numFields = totalFields
98 | while numFields > 0:
99 | v = rand.randint(0, 3)
100 | if v is 0:
101 | # Simple type
102 | dt = acceptedTypes[rand.randint(0, len(acceptedTypes))]
103 | fields.append(StructField(f"col_{i}", dt, bool(rand.getrandbits(1))))
104 | numFields = -1
105 | elif v is 1:
106 | # Array
107 | dt = acceptedTypes[rand.randint(0, len(acceptedTypes))]
108 | fields.append(StructField(f"col_{i}", ArrayType(dt), bool(rand.getrandbits(1))))
109 | numFields = -1
110 | else:
111 | n = max(rand.randint(0, numFields), 1)
112 | nested = self.randomNestedSchema(rand, n, acceptedTypes)
113 | fields.append(StructField("col_" + i, nested, bool(rand.getrandbits(1))))
114 | numFields = numFields - n
115 |
116 | i = i + 1
117 | return StructType(fields)
118 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/utils/__init__.py
--------------------------------------------------------------------------------
/src/com/vitthalmirji/utils/audit_util.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 |
4 | def audit_action(action):
5 | def audit_decorator(func):
6 | def audit(*args, **kwargs):
7 | # Invoke the wrapped function first
8 | retval = func(*args, **kwargs)
9 | # Now do something here with retval and/or action
10 | logging.debug(f'Executed {action}, Callback return value {retval}')
11 | return retval
12 | return audit
13 | return audit_decorator
14 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/utils/comprehensive_logging.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import logging.config
3 | import sys
4 | from pathlib import Path
5 |
6 | from com.vitthalmirji.utils.constants import JOB_START_TIME
7 | from com.vitthalmirji.utils.helpers import read_json_get_dict, get_project_root
8 |
9 |
10 | def init_logging(job_name, log_time_stamp=JOB_START_TIME, log_path=f'{get_project_root()}/logs/python',
11 | log_properties_path=f"{get_project_root()}/conf/python/logging-properties.json"):
12 | """
13 | Initiates the logging object with given configurations
14 |
15 | Args:
16 | :param log_properties_path: Location of properties file.
17 | default to local project folder's /conf/python/logging-properties.json
18 | :param job_name: Name of the application
19 | :param log_time_stamp: Timestamp to append in log file name
20 | :param log_path: Location to store logs.
21 | Default location /logs/python/
22 |
23 | Returns: N/A
24 | """
25 | Path(log_path).mkdir(parents=True, exist_ok=True)
26 | log_conf = read_json_get_dict(json_path=log_properties_path)
27 | log_file = f"{log_path}/log-{job_name}_{log_time_stamp}.log"
28 | log_conf['handlers']['file']['filename'] = log_file
29 |
30 | # In case of Unit test cases do not log to file
31 | if 'unittest' in sys.modules.keys():
32 | log_conf['handlers'] = {'console': log_conf['handlers']['console']}
33 | log_conf['root']['handlers'] = ['console']
34 |
35 | print('Logging initiating using below properties')
36 | print(log_conf)
37 | logging.config.dictConfig(log_conf)
38 | logging.info(f'Logging initiated; appending logs to {log_file}')
39 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/utils/constants.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime
2 |
3 | from com.vitthalmirji.utils.helpers import get_user
4 |
5 | USER = get_user()
6 | JOB_START_TIME = datetime.now().strftime('%Y-%m-%dT%H-%M-%S-%f')
7 | SPARK_APPLICATION_NAME = f"Spark application launched by {USER}"
8 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/utils/data_quality.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | from typing import List
4 |
5 | from pyspark.sql import DataFrame
6 | from pyspark.sql.functions import count
7 |
8 | from com.vitthalmirji.utils.helpers import create_dir, log_exception_details
9 | from com.vitthalmirji.utils.spark import get_or_create_spark_session
10 |
11 |
12 | class Rule(object):
13 | def __init__(self, rule_id: int, name: str, description: str, rule_type: str, columns: List[str] = None,
14 | query: str = None):
15 | self.rule_id = rule_id
16 | self.name = name
17 | self.description = description
18 | self.rule_type = rule_type
19 | self.columns = columns if columns else None
20 | self.query = query if query else None
21 |
22 |
23 | class RuleExecutionResult:
24 | def __init__(self, rule: Rule, status, pass_count: int, fail_count: int, total_count):
25 | self.rule = rule
26 | self.status = status
27 | self.pass_count = pass_count
28 | self.fail_count = fail_count
29 | self.total_count = total_count
30 |
31 |
32 | class DataQuality(object):
33 | def __init__(self, dq_id: int, rules: list[dict] = None, email_execution_report_to: str = None,
34 | execution_reports_dir: str = None):
35 | logging.info(
36 | f"Initializing Data quality service for DQ ID {dq_id}, reports will be available in file {execution_reports_dir}")
37 | self.html_report = None
38 | self.df = None
39 | self.total_count = None
40 | self.execution_results = None
41 | self.dq_id = dq_id
42 | self.rules: List[Rule] = [Rule(**rule) for rule in rules] if rules else None
43 | self.email_execution_report_to = email_execution_report_to if email_execution_report_to else None
44 | self.spark = get_or_create_spark_session()
45 | self.yarn_id = self.spark.sparkContext.applicationId
46 | self.execution_reports_dir = execution_reports_dir if execution_reports_dir else None
47 | if self.execution_reports_dir:
48 | create_dir(self.execution_reports_dir)
49 |
50 | def execute_unique_rule(self, rule: Rule):
51 | """
52 | Executes Duplicates check on given Primary keys in `rule`
53 |
54 | Args:
55 | :param rule: Rule of type `unique` having list of primary keys
56 |
57 | Returns:
58 | :return: RuleExecutionResult with status fail if duplicates are present pass otherwise and count of duplicates
59 |
60 | Exceptions:
61 | :exception: Thrown by calling functions called in this function
62 | """
63 | logging.warning(f"Executing DQ Rule for {rule.name} on {rule.columns}")
64 | dups_count = self.df.select(rule.columns).groupby(rule.columns).agg(count("*").alias('cnt')).alias(
65 | 'cnt').filter('cnt > 1').count()
66 |
67 | return RuleExecutionResult(rule, 'fail' if dups_count > 0 else 'pass', self.total_count - dups_count,
68 | dups_count, self.total_count)
69 |
70 | def execute_not_null_rule(self, rule: Rule):
71 | """
72 | Executes Not null check on given list of columns in `rule`
73 |
74 | Args:
75 | :param rule: Rule of type `not null` having list of columns potentially not null
76 |
77 | Returns:
78 | :return: RuleExecutionResult with status fail if column values are null & pass otherwise and count of null records
79 |
80 | Exceptions:
81 | :exception: Thrown by calling functions called in this function
82 | """
83 | logging.warning(f"Executing DQ Rule for {rule.name} on {rule.columns}")
84 | filter_string = ' OR '.join(list(map(lambda c: f'{c} IS NULL OR TRIM({c}) = ""', rule.columns)))
85 | not_null_count = self.df.select(rule.columns).filter(filter_string).count()
86 | return RuleExecutionResult(rule, 'fail' if not_null_count > 0 else 'pass', self.total_count - not_null_count,
87 | not_null_count, self.total_count)
88 |
89 | def execute_query_rule(self, rule: Rule):
90 | """
91 | Executes query given in `rule`
92 | This is in case of custom data quality rule given in form of query
93 | Args:
94 | :param rule: Rule of type `query` having query to execute
95 |
96 | Returns:
97 | :return: RuleExecutionResult with status fail if duplicates are present pass otherwise and count of duplicates
98 |
99 | Exceptions:
100 | :exception: Thrown by calling functions called in this function
101 | """
102 | self.df.createOrReplaceTempView('temp')
103 | query = rule.query
104 | logging.warning(f"Executing DQ Rule for {rule.name} using query {rule.query}")
105 | query_count = self.spark.sql(query).count()
106 | return RuleExecutionResult(rule, 'fail' if query_count > 0 else 'pass',
107 | self.total_count - query_count,
108 | query_count, self.total_count)
109 |
110 | def execute_rules(self, df: DataFrame) -> tuple[bool, str]:
111 | """
112 | Executes list of rules (data quality checks) given on dataframe's data
113 | Args:
114 | :param df: Dataframe on which quality checks to be executed
115 | :param rules: List of rules mapped to Rule type
116 |
117 | Returns:
118 | :return: boolean status True if all rules executed successfully without any failures, False otherwise and
119 | HTML report of details executed rules
120 |
121 | Exceptions:
122 | :exception: All exceptions thrown by calling functions called in this function
123 | """
124 | logging.info("Starting data quality rules executions..")
125 | self.execution_results: List[RuleExecutionResult] = []
126 | self.df = df
127 | self.total_count = self.df.count()
128 | for unique_rule in list(filter(lambda r: r.rule_type.__eq__('unique'), self.rules)):
129 | self.execution_results.append(self.execute_unique_rule(unique_rule))
130 |
131 | for not_null_rule in list(filter(lambda r: r.rule_type.__eq__('not null'), self.rules)):
132 | self.execution_results.append(self.execute_not_null_rule(not_null_rule))
133 |
134 | for query_rule in list(filter(lambda r: r.rule_type.__eq__('query'), self.rules)):
135 | self.execution_results.append(self.execute_query_rule(query_rule))
136 |
137 | return False if list(filter(lambda exec_result: exec_result.status.__eq__('fail'), self.execution_results)) \
138 | else True, self.generate_report()
139 |
140 | def generate_report(self):
141 | """
142 | Generates HTML report of result of executed data quality checks
143 |
144 | Args: N/A
145 |
146 | Returns:
147 | :return: self.html_report a HTML report of details about executed DQ checks
148 |
149 | Exceptions:
150 | :exception: All exception thrown by calling functions called in this function
151 | """
152 | logging.info(f"Preparing Data quality rules report for {self.dq_id}")
153 | table_header = ' '.join(list(
154 | map(lambda header: f"{header} ", ["Yarn Application Id", "DQ ID", "Rule ID", "Rule Name",
155 | "Rule type", "Description", "Columns/Query", "Pass Count",
156 | "Fail Count",
157 | "Total Count"])))
158 |
159 | def rules_and_result(result: RuleExecutionResult):
160 | table_data = [self.yarn_id,
161 | self.dq_id,
162 | result.rule.rule_id,
163 | result.rule.name,
164 | result.rule.rule_type,
165 | result.rule.description,
166 | result.rule.columns,
167 | result.pass_count,
168 | result.fail_count,
169 | result.total_count
170 | ]
171 | return ' '.join(list(map(lambda d: f"{d} ", table_data)))
172 |
173 | failed_rules = list(filter(lambda result: result.status.__eq__('fail'), self.execution_results))
174 | failed_details = ' '.join(list(map(lambda result: f"{rules_and_result(result)} ", failed_rules)))
175 | failure_table = f'Failed DQ details ' \
176 | f'{table_header} ' \
177 | f'{failed_details}
' if failed_rules else ""
178 |
179 | passed_rules = list(filter(lambda result: result.status.__eq__('pass'), self.execution_results))
180 | passed_details = ' '.join(list(map(lambda result: f"{rules_and_result(result)} ", passed_rules)))
181 | passed_table = f'Succeeded DQ details ' \
182 | f'{table_header} ' \
183 | f'{passed_details}
' if passed_rules else ""
184 |
185 | opening_statement = "Team, " \
186 | f"Data Quality check finished successfully for DQ ID = {self.dq_id} " \
187 | f"{', with failures. ' if failed_rules else '. '}" \
188 | "Check details in below table of metrics.
"
189 | closing_statement = " " \
190 | f"Executed on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')}, " \
191 | "Thanks "
192 |
193 | self.html_report = f"{opening_statement}" \
194 | f"{failure_table if failed_rules else ''}" \
195 | f"{passed_table if passed_rules else ''}" \
196 | f"{closing_statement}"
197 |
198 | return self.html_report
199 |
200 | def write_report_to_html(self, file_name):
201 | """
202 | Writes Data Quality rules execution results to a html file
203 |
204 | Args:
205 | :param file_name: name of file to write report as HTML file
206 |
207 | Returns:
208 | :return: N/A
209 |
210 | Exceptions:
211 | :exception Throws exception if unable to write into html file but will not halt the execution process
212 | """
213 | logging.info(f"Writing data quality execution report to html file {self.execution_reports_dir}/{file_name}")
214 | try:
215 | if not self.execution_reports_dir:
216 | raise Exception("Empty file path")
217 | f = open(self.execution_reports_dir + "/" + file_name, "w")
218 | f.write(self.html_report)
219 | f.close()
220 | except Exception as ex:
221 | log_exception_details(message="Error writing report to html, skipping writing report",
222 | exception_object=ex)
223 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/utils/helpers.py:
--------------------------------------------------------------------------------
1 | import getpass
2 | import json
3 | import logging
4 | import traceback
5 | from pathlib import Path
6 |
7 | import isodate
8 | from isodate import ISO8601Error
9 |
10 |
11 | def create_dir(dir_path):
12 | """
13 | Creates directory from given path
14 | :param dir_path: relative path of directory to create
15 | :return: N/A
16 | """
17 | try:
18 | Path(dir_path).mkdir(parents=True, exist_ok=True)
19 | except Exception as ex:
20 | msg = f"Error creating directory from given relative path {dir_path}"
21 | log_exception_details(message=msg, exception_object=ex)
22 | raise ex
23 |
24 |
25 | def get_user():
26 | """
27 | Fetches username of the executor
28 |
29 | Args:
30 |
31 | Returns:
32 | :return: username of the executor / logged in machine
33 | """
34 | return getpass.getuser()
35 |
36 |
37 | def is_null_or_empty(obj) -> bool:
38 | """
39 | Checks if an object is null or empty if object is of type string
40 |
41 | Args:
42 | :param obj: object / variable to validate
43 |
44 | Returns:
45 | :return: bool True of object is null or string is empty False otherwise
46 | """
47 | if obj is None:
48 | return True
49 | elif type(obj) is str and str(obj).strip().__eq__(''):
50 | return True
51 | else:
52 | return False
53 |
54 |
55 | def get_project_root() -> Path:
56 | """
57 | Identifies project root, Returns project root, the repository root
58 | Args:
59 |
60 | Returns:
61 | :return: project's root path as type Path
62 | """
63 | return Path(__file__).parent.parent.parent.parent.parent
64 |
65 |
66 | def read_json_get_dict(json_path) -> dict:
67 | """
68 | Reads json file from given `json_path` & returns as python dict
69 | Args:
70 | :param :json_path : Absolute or Relative path of json file to read & convert
71 |
72 | Return:
73 | :return :json_as_dict: JSON content as dictionary type
74 | """
75 | try:
76 | with open(json_path, 'r') as stream:
77 | json_as_dict = json.load(stream)
78 | stream.close()
79 | return json_as_dict
80 | except Exception as ex:
81 | log_exception_details(f'Error reading json file {json_path}, error traceback below', ex)
82 |
83 |
84 | def log_exception_details(message, exception_object):
85 | """
86 | Logs the exception to console & log file for every exception
87 |
88 | Args:
89 | :param message: Developer's message on exception
90 | :param exception_object: Class object of the exception
91 |
92 | Returns: N/A
93 | """
94 | logging.error(exception_object.__str__())
95 | logging.error(traceback.format_exc())
96 | logging.exception(message)
97 |
98 |
99 | def convert_iso_to_time_duration(iso_time_duration: str):
100 | """
101 | Converts ISO time duration to time in hours, minutes & seconds
102 |
103 | Args:
104 | :param iso_time_duration: ISO time in string Example: PT1H, PT100M, PT2H5M
105 |
106 | Returns:
107 | :return: Returns duration as datetime.timedelta type.
108 | Example: 01:00:00, 01:40:00, 02:05:00
109 | """
110 | if is_null_or_empty(iso_time_duration):
111 | msg = f'Empty or Invalid time duration string {iso_time_duration}'
112 | logging.error(msg)
113 | return None
114 | try:
115 | return isodate.parse_duration(iso_time_duration)
116 | except ISO8601Error as isoError:
117 | msg = f"Error converting ISO time {iso_time_duration} to timedelta"
118 | log_exception_details(message=msg, exception_object=isoError)
119 | return None
120 |
121 |
122 | def add_iso_time_duration(time1: str, time2: str):
123 | """
124 | Adds two string time duration, first converts to timedelta then adds to return the result
125 | Args:
126 | :param time1: First time as string value
127 | :param time2: Second time as string value
128 |
129 | Returns:
130 | :return: time1 + time2 as datetime.timedelta type
131 | """
132 | if is_null_or_empty(time1) or is_null_or_empty(time2):
133 | msg = f'Empty or Invalid time duration string time1 = {time1}, time2 = {time2}'
134 | logging.error(msg)
135 | return None
136 |
137 | try:
138 | _time1 = convert_iso_to_time_duration(iso_time_duration=time1)
139 | _time2 = convert_iso_to_time_duration(iso_time_duration=time2)
140 | return isodate.duration_isoformat((_time1 + _time2))
141 | except ISO8601Error as isoError:
142 | msg = f"Error converting ISO time time1={time1} & time2={time2} to timedelta"
143 | logging.error(msg)
144 | log_exception_details(message=msg, exception_object=isoError)
145 | return None
146 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/utils/logging_util.py:
--------------------------------------------------------------------------------
1 | import atexit
2 | import logging
3 | from logging.config import ConvertingList, ConvertingDict, valid_ident
4 | from logging.handlers import QueueHandler, QueueListener
5 | from queue import Queue
6 |
7 |
8 | def _resolve_handlers(l):
9 | if not isinstance(l, ConvertingList):
10 | return l
11 | # Indexing the list performs the evaluation.
12 | return [l[i] for i in range(len(l))]
13 |
14 |
15 | def _resolve_queue(q):
16 | if not isinstance(q, ConvertingDict):
17 | return q
18 | if '__resolved_value__' in q:
19 | return q['__resolved_value__']
20 |
21 | cname = q.pop('class')
22 | klass = q.configurator.resolve(cname)
23 | props = q.pop('.', None)
24 | kwargs = {k: q[k] for k in q if valid_ident(k)}
25 | result = klass(**kwargs)
26 | if props:
27 | for name, value in props.items():
28 | setattr(result, name, value)
29 |
30 | q['__resolved_value__'] = result
31 | return result
32 |
33 |
34 | class QueueListenerHandler(QueueHandler):
35 | def __init__(self, handlers, respect_handler_level=False, auto_run=True, queue=Queue(-1)):
36 | queue = _resolve_queue(queue)
37 | super().__init__(queue)
38 | handlers = _resolve_handlers(handlers)
39 | self._listener = QueueListener(
40 | self.queue,
41 | *handlers,
42 | respect_handler_level=respect_handler_level)
43 | if auto_run:
44 | self.start()
45 | atexit.register(self.stop)
46 |
47 | def start(self):
48 | self._listener.start()
49 |
50 | def stop(self):
51 | self._listener.stop()
52 |
53 | # def emit(self, record):
54 | # return super().emit(record)
55 |
56 |
57 | BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8)
58 |
59 | # These are the sequences need to get colored output
60 | RESET_SEQ = "\033[0m"
61 | COLOR_SEQ = "\033[0;%dm"
62 | BOLD_SEQ = "\033[1m"
63 |
64 | COLORS = {
65 | 'WARNING': YELLOW,
66 | 'INFO': GREEN,
67 | 'DEBUG': MAGENTA,
68 | 'CRITICAL': RED,
69 | 'ERROR': RED
70 | }
71 |
72 |
73 | class ColoredFormatter(logging.Formatter):
74 | def __init__(self, msg, use_color=True):
75 | logging.Formatter.__init__(self, msg)
76 | self.use_color = use_color
77 |
78 | def format(self, record):
79 | if self.use_color and record.levelname in COLORS:
80 | # The background is set with 40 plus the number of the color, and the foreground with 30
81 | record.levelname = COLOR_SEQ % (30 + COLORS[record.levelname]) + record.levelname + RESET_SEQ
82 | return logging.Formatter.format(self, record)
83 |
84 |
85 | def formatter_message(message, use_color=True):
86 | if use_color:
87 | message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ)
88 | else:
89 | message = message.replace("$RESET", "").replace("$BOLD", "")
90 | return message
91 |
92 |
93 | class ColoredLogger(logging.Logger):
94 | # FORMAT = "$BOLD%(name)-20s$RESET][%(levelname)-18s] %(message)s ($BOLD%(filename)s$RESET:%(lineno)d)"
95 | def __init__(self, name):
96 | logging.Logger.__init__(self, name, logging.DEBUG)
97 | self.FORMAT = '%(asctime)s %(name)-15s [$BOLD%(levelname)-10s$RESET] %(process)-10d %(funcName)-30s %(message)s'
98 | self.COLOR_FORMAT = formatter_message(self.FORMAT, True)
99 | color_formatter = ColoredFormatter(self.COLOR_FORMAT)
100 |
101 | console = logging.StreamHandler()
102 | console.setFormatter(color_formatter)
103 |
104 | self.addHandler(console)
105 | return
106 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/utils/spark.py:
--------------------------------------------------------------------------------
1 | import logging
2 | from typing import List
3 |
4 | from pyspark.sql import SparkSession, DataFrame
5 | from pyspark.sql.functions import concat_ws, col, floor, rand
6 | from pyspark.sql.types import StringType
7 |
8 | from com.vitthalmirji.utils.helpers import log_exception_details, is_null_or_empty
9 |
10 |
11 | def get_or_create_spark_session(need_hive_support: bool = False,
12 | spark_conf: List[dict] = [{'key': 'spark.app.name', 'value': ''}]) -> SparkSession:
13 | """
14 | Creates a spark session with given configuration in parameters
15 |
16 | Args:
17 | :param application_name: Name of the spark application
18 | :param spark_conf: Specific Spark Configurations at user level (default is None)
19 | :param need_hive_support: Enable Hive support in spark session? (default is False)
20 |
21 | Returns:
22 | An object of SparkSession
23 |
24 | Exceptions:
25 | Throws any exception on to calling function that has encountered during creating SparkSession
26 | :exception type of exception is broader, this can be improvised to handle more specific exceptions
27 | """
28 | spark: SparkSession
29 | try:
30 | spark: SparkSession = SparkSession.getActiveSession()
31 | if spark:
32 | logging.warning("Returning active spark session")
33 | return spark
34 |
35 | logging.warning(f"Creating spark session first time with configs {spark_conf}")
36 |
37 | if need_hive_support:
38 | spark = SparkSession.builder \
39 | .enableHiveSupport() \
40 | .getOrCreate()
41 | else:
42 | spark = SparkSession.builder \
43 | .getOrCreate()
44 |
45 | for conf in list(spark_conf):
46 | spark.conf.set(**conf)
47 |
48 | logging.warning(f"Executor cores = {spark.conf.get('spark.executor.cores', 'Not set')}")
49 | logging.warning(f"Num Executors = {spark.conf.get('spark.executor.instances', 'Not set')}")
50 | return spark
51 | except Exception as ex:
52 | log_exception_details(message="Error creating spark session", exception_object=ex)
53 | raise ex
54 |
55 |
56 | def read_data_as_spark_dataframe(filetype: str, location: str, options={}, table_name=None) -> DataFrame:
57 | """
58 | Reads various kind of files & tables in spark
59 | Args:
60 | :param filetype:
61 | :param location:
62 | :param options:
63 | :param table_name:
64 |
65 | Returns:
66 | :return: A DataFrame object
67 |
68 | Exception:
69 | Throws any exception that is encountered during file / table read in spark
70 | :exception type of exception is broader, this can be improvised to handle more specific exceptions
71 | """
72 | logging.warning(f"Attempting to read {filetype} in spark using configs {options} from location {location}")
73 | spark = get_or_create_spark_session()
74 | try:
75 | if str(filetype).lower().__eq__('table'):
76 | if is_null_or_empty(table_name) is not None:
77 | try:
78 | _ = spark.read.options(**options).table(table_name)
79 | except Exception as ex:
80 | log_exception_details(message=f"Error reading table {table_name}", exception_object=ex)
81 | raise ex
82 | else:
83 | print(f"Invalid table {table_name} -Table do not exist in SQL Context: ")
84 | elif str(filetype).lower().__eq__('text'):
85 | logging.warning(
86 | "Lines will be read from the text file and dataframe will have single column by name 'line'")
87 | return spark.read.options(**options).text(paths=location).toDF('line')
88 | elif str(filetype).lower().__eq__('csv'):
89 | return spark.read.options(**options).csv(path=location)
90 | elif str(filetype).lower().__eq__('xml'):
91 | return spark.read.format('com.databricks.spark.xml').options(**options).load(path=location)
92 | elif str(filetype).lower().__eq__('json'):
93 | return spark.read.options(**options).json(path=location)
94 | elif str(filetype).lower().__eq__('orc'):
95 | return spark.read.options(**options).orc(location)
96 | elif str(filetype).lower().__eq__('parquet'):
97 | return spark.read.options(**options).parquet(location)
98 | else:
99 | raise Exception(f"Invalid filetype: {filetype}")
100 | except Exception as ex:
101 | log_exception_details(message=f"Error reading file in Spark of filetype {filetype}", exception_object=ex)
102 | raise ex
103 |
104 |
105 | def revise_shuffle_partitions(multiplier: int = 1):
106 | """
107 | Sets the shuffle partition to total number of cores across all executors
108 | Useful in dataframe operations using spark
109 | :param multiplier: In case of stage failures increase the multiplier
110 | :return: N/A
111 | """
112 | spark = get_or_create_spark_session()
113 | num_executors = int(spark.conf.get('spark.executor.instances', '2').strip())
114 | num_cores = int(spark.conf.get('spark.executors.cores', '1').strip())
115 | revised_shuffle_partition = num_executors * num_cores * multiplier
116 | spark.conf.set('spark.sql.shuffle.partitions', f"{revised_shuffle_partition}")
117 |
118 |
119 | def data_frame_repartition(df: DataFrame, num_files: int = None, use_coalesce=False, repartition_columns=None):
120 | """
121 | Function to repartition data for better performance.
122 | Majorly has 2 types: #1 - coalesce: to narrow down files in output; #2 - repartition: to uniformly distribute data in output
123 | Note: This involves shuffling (wide transformation)
124 | Args:
125 | :param df: Dataframe on which repartition (wide transformation) to be performed
126 | :param num_files: Number of output files required
127 | :param use_coalesce: Use this to narrow down the number of files irrespective of any columns default is False
128 | :param repartition_columns: Columns on which repartition to be performed
129 | Most important note: Columns specified here must & should be low cardinality values in table
130 | Returns:
131 | :return: Dataframe with repartition or coalesce transformation applied
132 | """
133 | if use_coalesce:
134 | return df.coalesce(num_files)
135 |
136 | columns_list = list(map(lambda column: col(column).cast(StringType()),
137 | repartition_columns)) if repartition_columns is not None else []
138 |
139 | if num_files is None and len(columns_list) > 0:
140 | return df.repartition(*columns_list)
141 |
142 | salting_column = floor(rand() * num_files)
143 | temp_repartition_column = 'temp_repartition_column'
144 | return df.withColumn(
145 | temp_repartition_column,
146 | concat_ws('~', *columns_list, salting_column)
147 | ).repartition(temp_repartition_column).drop(temp_repartition_column)
148 |
149 |
150 | def standardize_and_rename_df_columns(df: DataFrame, column_names_to_rename: dict):
151 | """
152 | Performs renaming column names on given dataframe:
153 | Trims if column name has leading & trailing whitespaces
154 | For given dictionary of columns renames according to specified name
155 | Args:
156 | :param df: DataFrame for renaming columns
157 | :param column_names_to_rename: dictionary having existing column name & revised / renaming column name
158 |
159 | Returns:
160 | :return: _df transformed dataframe with column names renamed
161 |
162 | Exceptions:
163 | :exception Throws exception that's encountered during renaming column on dataframe
164 | """
165 | _df = df
166 | try:
167 | # Trim and lowercase all column names
168 | for column_name in filter(lambda c: not column_names_to_rename.keys().__contains__(c), df.columns):
169 | _df = _df.withColumnRenamed(column_name, column_name.strip().lower())
170 |
171 | for column_name, revised_column_name in column_names_to_rename.items():
172 | _df = _df.withColumnRenamed(column_name, revised_column_name)
173 | return _df
174 | except Exception as ex:
175 | log_exception_details(message=f"Error renaming columns on given dataframe {column_names_to_rename}",
176 | exception_object=ex)
177 | raise ex
178 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/utils/spark_submit_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import logging.config
3 | from datetime import datetime
4 |
5 | from com.vitthalmirji.utils.Utilities import get_dates_between_range
6 |
7 | START_TIME = datetime.now().isoformat().__str__()
8 |
9 |
10 | class StaticConfigParameterNotFound(Exception):
11 | pass
12 |
13 |
14 | def sort_spark_submit_options(command_options):
15 | sorted_command_options = sorted(command_options, key=lambda k: k[0])
16 | return sorted_command_options
17 |
18 |
19 | def update_conf(command_options):
20 | _conf = {k: v for k, v in command_options.items() if k == '--conf'}['--conf']['value']
21 | _conf = get_spark_conf_key_value_as_string(_conf)
22 | _conf = f"\"{_conf}\""
23 | command_options['--conf'].update({'value': _conf})
24 |
25 | return command_options
26 |
27 |
28 | def get_class_arguments_as_string(command):
29 | return ' \\\n'.join(list(map(lambda c: f"{c}={command['--class_arguments']['value'][c]}",
30 | command['--class_arguments']['value'])))
31 |
32 |
33 | def get_spark_conf_key_value_as_string(conf):
34 | return f"""{",".join([f"{d}={conf[d]}" for d in conf])}\""""
35 |
36 |
37 | def static_config_args_sanity_check(command, config):
38 | for cmd in command:
39 | if config.get(cmd) is None and command[cmd]['required'] is True:
40 | logging.error(f"Configuration file do not have required spark-submit option {cmd}")
41 | raise StaticConfigParameterNotFound(
42 | f"ERROR: Configuration file do not have required spark-submit option {cmd}")
43 | elif config.get(cmd) is not None:
44 | command[cmd].update({'value': config.get(cmd)})
45 | else:
46 | continue
47 | return command
48 |
49 |
50 | def update_spark_submit_option_values(runtime_args, config_args, command):
51 | config_args['default']['--conf'].update(config_args[runtime_args['workflow']]['spark_conf'])
52 | config_args['default']['--name'] = f"\"{runtime_args['workflow']}\""
53 | command['--class_arguments']['value'].update(runtime_args)
54 | return config_args, command
55 |
56 |
57 | def prepare_spark_submit(runtime_args, config_args, app_config):
58 | command = app_config['spark_submit_options_order']
59 | _config_args, command = update_spark_submit_option_values(runtime_args, config_args, command)
60 | _config_args = _config_args['default']
61 |
62 | command = static_config_args_sanity_check(command, _config_args)
63 |
64 | command_date_ranges = get_dates_between_range(refresh_type=runtime_args['refreshType'],
65 | start_date=runtime_args['startDate'],
66 | end_date=runtime_args['endDate'],
67 | interval_in_days=app_config['default_settings'][
68 | 'history_load_interval_in_days'],
69 | date_pattern='%Y-%m-%d')
70 | logging.debug(f"Date Range = {command_date_ranges}")
71 | command = update_conf(command_options=command)
72 | spark_submit_command = ' \\\n'.join(f"{k} {v['value']}" for k, v in command.items() if k != '--class_arguments')
73 |
74 | logging.debug(command)
75 | command_list = []
76 | for d in command_date_ranges:
77 | command['--class_arguments']['value'].update(d)
78 | class_args = get_class_arguments_as_string(command)
79 | command_list.append(f"{spark_submit_command}\n{class_args}")
80 | return command_list
81 |
--------------------------------------------------------------------------------
/src/com/vitthalmirji/utils/transformation_extension.py:
--------------------------------------------------------------------------------
1 | from pyspark.rdd import RDD
2 | from pyspark.sql.dataframe import DataFrame
3 |
4 |
5 | def transform(self, f):
6 | return f(self)
7 |
8 |
9 | RDD.transform = transform
10 | DataFrame.transform = transform
11 |
--------------------------------------------------------------------------------
/tests/EtlTransformTest.py:
--------------------------------------------------------------------------------
1 | import time
2 | import unittest
3 |
4 | from etl import Transform
5 | from etl.meta import MetaModel
6 | from utils.Utilities import SparkSettings
7 |
8 | start_time = time.time()
9 |
10 |
11 | class MyTestCase(unittest.TestCase):
12 | def testEtlTransformations(self):
13 | self.spark = SparkSettings("EtlTransformTest").getSparkSession()
14 | metamodel = MetaModel(datamodelpath='resources/datamodel.csv', sc=self.spark)
15 |
16 | # print(f"Data model as JSON -> \n{metamodel.datamodel}")
17 |
18 | metamodel.readMetadataFromCsv(sc=self.spark, metadatapath='resources/meta.csv', targettable='invoice')
19 | metamodel.readSourceFilesIntoDF()
20 |
21 | targetddl = metamodel.getTargetDdl('PARQUET', True)
22 | # print('------Target DDL ------')
23 | # print('------Target Query ------')
24 | # print(f"{queryhead} {querytail}")
25 |
26 | # self.spark.sql(f"{queryhead} {querytail}").show()
27 |
28 | trans = Transform(targettable='invoice', model=metamodel, sc=self.spark)
29 |
30 | trans.transform()
31 | self.assertIsNotNone(trans)
32 |
33 |
34 | if __name__ == '__main__':
35 | unittest.main()
36 |
--------------------------------------------------------------------------------
/tests/UtilsTest.py:
--------------------------------------------------------------------------------
1 | import getpass
2 | import unittest
3 |
4 | from utils.Utilities import create_spark_session, count_words, split_words
5 |
6 |
7 | class UtilsTest(unittest.TestCase):
8 |
9 | def test1_testSparkSettings(self):
10 | print("Testing Spark Settings")
11 | self.spark = create_spark_session(application_name="Utils Test")
12 | self.assertEqual(str(self.spark.version), "2.4.5")
13 | self.assertEqual(str(self.spark.sparkContext.sparkUser()), getpass.getuser())
14 |
15 | metadf: DataFrame = self.spark.read.option("header", "true").format("csv").load(path="resources/meta.csv")
16 |
17 | self.assertEqual(True, True)
18 |
19 | def test2_test_custom_transformations(self):
20 | print("Testing Environment")
21 | self.spark = create_spark_session(application_name="Utils Test")
22 | line_array = ["Hello,World,How,are,you", "Hello.World.How.are.you", "Hello;World;How;are;you",
23 | "Hello-World-How-are-you", "Hello|World|How|are|you", "Hello World How are you"]
24 |
25 | lines_rdd: RDD[str] = self.spark.sparkContext.parallelize(line_array)
26 | df = lines_rdd.transform(lambda _rdd: split_words(_rdd)).transform(lambda _rdd: count_words(_rdd))
27 | df.toDF().toDF("Word", "Count").show()
28 |
29 | self.assertTrue(df is not None)
30 | self.assertEqual(df.count(), 5)
31 |
32 |
33 | if __name__ == '__main__':
34 | unittest.main()
35 |
--------------------------------------------------------------------------------
/tests/XmlMapperTest.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from com.vitthalmirji.imports.HdfsImport import HdfsImport
4 | from com.vitthalmirji.mapper.Mapper import ComplexDataMapper
5 |
6 |
7 | class XmlMapperTest(unittest.TestCase):
8 | def test_create_hive_ql_for_nested_data_explode(self):
9 | print("Testing HdfsImport readFromSource")
10 | self.sparksettings = SparkSettings("XmlMapperTest")
11 | self.spark = self.sparksettings.getSparkSession()
12 | self.hdfsImport = HdfsImport(self.spark)
13 |
14 | # Read JSON file from given path
15 | json_df = self.spark.read.json(path='resources/clinical_trial/*.xml')
16 |
17 | json_df.printSchema()
18 |
19 | # Register as temporary view / table for flattening queries to execute on
20 | json_df.createOrReplaceTempView('jsontable')
21 |
22 | # self.spark.range(10).select(monotonically_increasing_id()).show()
23 | # self.spark.range(10).select(monotonically_increasing_id()).coalesce(1).show()
24 | # self.spark.range(10).repartition(5).select(monotonically_increasing_id()).coalesce(1).show()
25 |
26 | # Create an object of class XmlMapper from Mapper.py by passing spark variable
27 | xml_mapper: ComplexDataMapper = ComplexDataMapper(sc=self.spark)
28 |
29 | # Call createViews function by passing json_df dataframe, it returns 2 things flattening queries and XPATH (
30 | # Only for XML; Ignore for JSON)
31 | view_queries = xml_mapper.createViews(df=json_df, root_table_name='jsontable',
32 | columns_cascade_to_leaf_level_with_alias=[
33 | 'item.organizationId AS pk_organizationId'])
34 |
35 | # Loop through all queries, execute them, physicalize flattened attributes as table - Repeat steps to all
36 | # queries (Nested attributes)
37 | for q in view_queries[0]:
38 | print(f'{q}:' f'{view_queries[0][q]}')
39 | temp_df = self.spark.sql(view_queries[0][q])
40 | temp_df.rdd.zipWithUniqueId().toDF().printSchema()
41 | temp_df.createOrReplaceTempView(q)
42 | select_cols = []
43 | for col in temp_df.schema.fields:
44 | if not str(col.dataType).lower().startswith("struct") and not str(col.dataType).lower().startswith(
45 | "array"):
46 | select_cols.append(col.name)
47 | print(f"Total partitions = {temp_df.rdd.getNumPartitions()}")
48 | temp_df.select(select_cols).show()
49 |
50 |
51 | if __name__ == '__main__':
52 | unittest.main()
53 |
--------------------------------------------------------------------------------
/tests/aws_test/AwsS3Test.py:
--------------------------------------------------------------------------------
1 | import os
2 | import signal
3 | import subprocess
4 | import unittest
5 |
6 | import boto3
7 | from pyspark.sql.dataframe import DataFrame
8 | from pyspark.sql.session import SparkSession
9 |
10 | from utils.Utilities import list_s3_files
11 |
12 |
13 | class AwsS3Test(unittest.TestCase):
14 | @classmethod
15 | def setUpClass(cls) -> None:
16 | # create an s3 connection that points to the moto server.
17 | cls.s3_resource_obj = boto3.resource(
18 | "s3",
19 | endpoint_url="http://127.0.0.1:5000"
20 | )
21 |
22 | cls.s3_client_obj = boto3.client(
23 | "s3",
24 | endpoint_url="http://127.0.0.1:5000"
25 | )
26 | # start moto server, by default it runs on localhost on port 5000.
27 | cls.process = subprocess.Popen(
28 | ['moto_server', 's3'],
29 | stdout=subprocess.PIPE,
30 | shell=True,
31 | creationflags=subprocess.CREATE_NEW_PROCESS_GROUP
32 | )
33 |
34 | # create an S3 bucket.
35 | cls.s3_resource_obj.create_bucket(Bucket="bucket")
36 |
37 | # # configure pyspark to use hadoop-aws module. os.environ[ "PYSPARK_SUBMIT_ARGS" ] = '--packages
38 | # "org.apache.hadoop:hadoop-aws:2.7.3" --packages "org.apache.httpcomponents:httpclient:4.2.5" ' \
39 | # '--packages "org.xerial.snappy:snappy-java:1.1.7.3" pyspark-shell '
40 |
41 | # get the spark session object and hadoop configuration.
42 | cls.spark: SparkSession = SparkSession.builder.getOrCreate()
43 | cls.hadoop_conf = cls.spark.sparkContext._jsc.hadoopConfiguration()
44 | # mock the aws credentials to access s3.
45 | cls.hadoop_conf.set("fs.s3a.access.key", "dummy-value")
46 | cls.hadoop_conf.set("fs.s3a.secret.key", "dummy-value")
47 | # we point s3a to our moto server.
48 | cls.hadoop_conf.set("fs.s3a.endpoint", "http://127.0.0.1:5000")
49 | # we need to configure hadoop to use s3a.
50 | cls.hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
51 |
52 | @classmethod
53 | def test_dataframe_operation_s3(cls):
54 | # create a pyspark dataframe.
55 | values = [("k1", 1), ("k2", 2)]
56 | columns = ["key", "value"]
57 | df = cls.spark.createDataFrame(values, columns)
58 | # write the dataframe as csv to s3.
59 | df.write.mode('overwrite').csv("s3://bucket/source.csv")
60 | # read the dataset from s3
61 | df = cls.spark.read.csv("s3://bucket/source.csv")
62 | # print Data
63 | df.show()
64 | # assert df is a DataFrame
65 | assert isinstance(df, DataFrame)
66 |
67 | print("test_s3_glue_jobs_locally successfully completed")
68 |
69 | @classmethod
70 | def test_3_create_directory_files_s3(cls):
71 | some_binary_data = b'Here we have some data'
72 |
73 | cls.s3_client_obj.put_object(Bucket="bucket", Key=("dir1" + '/'))
74 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir1/dir1.txt')
75 |
76 | cls.s3_client_obj.put_object(Bucket="bucket", Key=("dir1/subdir1" + '/'))
77 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir1/subdir1/dir1_subdir1.txt')
78 |
79 | cls.s3_client_obj.put_object(Bucket="bucket", Key=("dir1/subdir2" + '/'))
80 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir1/subdi2/dir1_subdir2.txt')
81 |
82 | cls.s3_client_obj.put_object(Bucket="bucket", Key="dir2/")
83 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir2/dir2.txt')
84 |
85 | cls.s3_client_obj.put_object(Bucket="bucket", Key="dir2/subdir1/")
86 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir2/subdir1/dir2_subdir1.txt')
87 |
88 | cls.s3_client_obj.put_object(Bucket="bucket", Key="dir2/subdir2/")
89 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir2/subdir2/dir2_subdir2.txt')
90 |
91 | contents = list_s3_files(opt={'Bucket': 'bucket'})
92 | print(contents)
93 | contents = list_s3_files(opt={'Bucket': 'bucket'}, files_only=True)
94 | print(contents)
95 | contents = list_s3_files(opt={'Bucket': 'bucket'}, files_only=True,
96 | file_extension='.csv')
97 | print(contents)
98 | contents = list_s3_files(opt={'Bucket': 'bucket'}, files_only=True,
99 | file_extension='.xml')
100 | print(contents)
101 |
102 | @classmethod
103 | def tearDownClass(cls) -> None:
104 | # shut down the moto server.
105 | os.kill(cls.process.pid, signal.SIGTERM)
106 |
107 |
108 | if __name__ == '__main__':
109 | unittest.main()
110 |
--------------------------------------------------------------------------------
/tests/aws_test/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/tests/aws_test/__init__.py
--------------------------------------------------------------------------------
/tests/aws_test/glue_job.py:
--------------------------------------------------------------------------------
1 | import sys
2 |
3 | from pyspark.context import SparkContext
4 |
5 |
6 | # https://github.com/aws-samples/aws-glue-samples/tree/master/examples
7 |
8 | def run(cli_args, spark):
9 | # init glue pyspark job
10 | glue_args = _get_glue_args(cli_args=cli_args)
11 | spark_session, job = _get_spark_session_and_glue_job(glue_args)
12 |
13 | # run glue job code
14 | source = cli_args["source"]
15 | destination = cli_args["destination"]
16 | df = spark.read.csv(source)
17 | df.write.csv(destination)
18 |
19 | # commit job
20 | _commit_job(job)
21 |
22 |
23 | def _get_spark_session_and_glue_job(glue_args):
24 | from awsglue.context import GlueContext
25 | from awsglue.job import Job
26 |
27 | sc = SparkContext.getOrCreate()
28 | glue_context = GlueContext(sparkContext=sc)
29 | job = Job(glue_context=glue_context)
30 | job.init(glue_args["JOB_NAME"], glue_args)
31 | return glue_context.spark_session, job
32 |
33 |
34 | def _commit_job(job):
35 | job.commit()
36 |
37 |
38 | def _get_glue_args(cli_args):
39 | from awsglue.utils import getResolvedOptions
40 | glue_args = getResolvedOptions(args=cli_args, options=["JOB_NAME", "source", "destination"])
41 | print(glue_args)
42 | return glue_args
43 |
44 |
45 | if __name__ == "__main__":
46 | run(["source", "destination"])
47 |
--------------------------------------------------------------------------------
/tests/aws_test/test_glue_job.py:
--------------------------------------------------------------------------------
1 | import os
2 | import signal
3 | import subprocess
4 | import unittest
5 | from unittest import mock
6 |
7 | import boto3
8 | from pyspark.sql import SparkSession
9 |
10 | from aws_test import glue_job
11 | from utils.Utilities import delete_s3_bucket
12 |
13 |
14 | class TestGlueJob(unittest.TestCase):
15 | """
16 | This test class setup a test environment to test our glue job,
17 | runs the glue job and checks the result.
18 | """
19 |
20 | @classmethod
21 | def setUpClass(cls):
22 | """
23 | the setup class starts a moto server, creates an S3 bucket,
24 | configures PySpark and Spark and dumps the source dataframe to S3.
25 | """
26 | S3_MOCK_ENDPOINT = "http://127.0.0.1:5000"
27 |
28 | # setup moto server
29 | # cls.process = subprocess.Popen(
30 | # "moto_server s3", stdout=subprocess.PIPE,
31 | # shell=True, preexec_fn=os.setsid()
32 | # )
33 |
34 | os.environ['AWS_ACCESS_KEY_ID'] = 'test'
35 | os.environ['AWS_SECRET_ACCESS_KEY'] = 'test'
36 |
37 | cls.process = subprocess.Popen(
38 | ['moto_server', 's3'],
39 | stdout=subprocess.PIPE,
40 | shell=True,
41 | creationflags=subprocess.CREATE_NEW_PROCESS_GROUP
42 | )
43 |
44 | # create s3 connection, bucket and s3 url's
45 | cls.s3_conn = boto3.resource(
46 | "s3", region_name="eu-central-1",
47 | endpoint_url=S3_MOCK_ENDPOINT
48 | )
49 | bucket = "bucket"
50 | delete_s3_bucket(bucket)
51 | cls.s3_conn.create_bucket(Bucket=bucket)
52 | cls.s3_source = "s3://{}/{}".format(bucket, "source.csv")
53 | cls.s3_destination = "s3://{}/{}".format(bucket, "destination.csv")
54 |
55 | # Setup spark to use s3, and point it to the moto server.
56 | os.environ[
57 | "PYSPARK_SUBMIT_ARGS"
58 | ] = """--packages "org.apache.hadoop:hadoop-aws:2.7.4" pyspark-shell"""
59 | cls.spark = SparkSession.builder.getOrCreate()
60 | hadoop_conf = cls.spark.sparkContext._jsc.hadoopConfiguration()
61 | hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
62 | hadoop_conf.set("fs.s3a.access.key", "mock")
63 | hadoop_conf.set("fs.s3a.secret.key", "mock")
64 | hadoop_conf.set("fs.s3a.endpoint", S3_MOCK_ENDPOINT)
65 |
66 | # create source dataframe and write the dataframe as csv to s3
67 | values = [("k1", 1), ("k2", 2)]
68 | columns = ["key", "value"]
69 | df = cls.spark.createDataFrame(values, columns)
70 | df.write.csv(cls.s3_source)
71 |
72 | @mock.patch("glue_job._commit_job")
73 | @mock.patch("glue_job._get_glue_args")
74 | @mock.patch("glue_job._get_spark_session_and_glue_job")
75 | def test_glue_job_runs_successfully(self, m_session_job, m_get_glue_args, m_commit):
76 | """
77 | we arrange our test function; construct the arguments that we get from the cli, set the return
78 | values of our mocked functions.
79 | we run our glue job and assert if the result is what we expect.
80 | """
81 | # arrange
82 | cli_args = {"--JOBNAME": 'TestGlueLocal', "--source": self.s3_source, "--destination": self.s3_destination}
83 |
84 | m_session_job.return_value = self.spark, None
85 | m_get_glue_args.return_value = cli_args
86 |
87 | # act
88 | glue_job.run(cli_args=cli_args, spark=self.spark)
89 |
90 | # assert
91 | df = self.spark.read.csv(self.s3_destination)
92 | self.assertTrue(not df.rdd.isEmpty())
93 |
94 | @classmethod
95 | def tearDownClass(cls):
96 | # shut down moto server
97 | os.killpg(os.getpgid(cls.process.pid), signal.SIGTERM)
98 |
99 |
100 | if __name__ == "__main__":
101 | try:
102 | unittest.main()
103 | except Exception:
104 | TestGlueJob().tearDownClass()
105 |
--------------------------------------------------------------------------------
/tests/aws_test/test_mocked_postgres.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import sqlalchemy
4 | from testcontainers.postgres import PostgresContainer
5 |
6 |
7 | class MockedPostgresTest(unittest.TestCase):
8 | @classmethod
9 | def test_docker_run_postgress(cls) -> None:
10 | postgres_container = PostgresContainer("postgres:9.5")
11 | with postgres_container as postgres:
12 | e = sqlalchemy.create_engine(postgres.get_connection_url())
13 | result = e.execute("SELECT version()")
14 |
15 | @classmethod
16 | def tearDownClass(cls) -> None:
17 | print('Done')
18 |
19 |
20 | if __name__ == '__main__':
21 | try:
22 | unittest.main()
23 | except Exception:
24 | MockedPostgresTest().tearDownClass()
25 |
--------------------------------------------------------------------------------
/tests/aws_test/testing_mocked_s3.py:
--------------------------------------------------------------------------------
1 | import os
2 | import signal
3 | import subprocess
4 | import unittest
5 |
6 | import boto3
7 | from pyspark.sql import DataFrame
8 | from pyspark.sql import SparkSession
9 |
10 |
11 | class MockTestGlueJob(unittest.TestCase):
12 | # start moto server, by default it runs on localhost on port 5000.
13 | process = subprocess.Popen(
14 | ['moto_server', 's3'],
15 | stdout=subprocess.PIPE,
16 | shell=True,
17 | creationflags=subprocess.CREATE_NEW_PROCESS_GROUP
18 | )
19 |
20 | @classmethod
21 | def setUpClass(cls) -> None:
22 | # create an s3 connection that points to the moto server.
23 | s3_conn = boto3.resource(
24 | "s3", endpoint_url="http://127.0.0.1:5000"
25 | )
26 | # create an S3 bucket.
27 | s3_conn.create_bucket(Bucket="bucket")
28 | # # configure pyspark to use hadoop-aws module. os.environ[ "PYSPARK_SUBMIT_ARGS" ] = '--packages
29 | # "org.apache.hadoop:hadoop-aws:2.7.3" --packages "org.apache.httpcomponents:httpclient:4.2.5" ' \
30 | # '--packages "org.xerial.snappy:snappy-java:1.1.7.3" pyspark-shell '
31 |
32 | def test_s3_glue_jobs_locally(self):
33 | # get the spark session object and hadoop configuration.
34 | spark = SparkSession.builder.getOrCreate()
35 | hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
36 | # mock the aws credentials to access s3.
37 | hadoop_conf.set("fs.s3a.access.key", "dummy-value")
38 | hadoop_conf.set("fs.s3a.secret.key", "dummy-value")
39 | # we point s3a to our moto server.
40 | hadoop_conf.set("fs.s3a.endpoint", "http://127.0.0.1:5000")
41 | # we need to configure hadoop to use s3a.
42 | hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
43 | # create a pyspark dataframe.
44 | values = [("k1", 1), ("k2", 2)]
45 | columns = ["key", "value"]
46 | df = spark.createDataFrame(values, columns)
47 | # write the dataframe as csv to s3.
48 | df.write.mode('overwrite').csv("s3://bucket/source.csv")
49 | # read the dataset from s3
50 | df = spark.read.csv("s3://bucket/source.csv")
51 | # print Data
52 | df.show()
53 | # assert df is a DataFrame
54 | assert isinstance(df, DataFrame)
55 | print("test_s3_glue_jobs_locally successfully completed")
56 |
57 | @classmethod
58 | def tearDownClass(cls) -> None:
59 | # shut down the moto server.
60 | os.kill(cls.process.pid, signal.SIGTERM)
61 |
62 |
63 | if __name__ == "__main__":
64 | try:
65 | unittest.main()
66 | except Exception:
67 | MockTestGlueJob().tearDownClass()
68 |
--------------------------------------------------------------------------------
/tests/resources/config.yml:
--------------------------------------------------------------------------------
1 | # required to connect to redshift
2 | host: my.redshift.cluster.com
3 | port: 5439
4 | database: db
5 | user: userid
6 | password: password
7 | ## optional extras for the dbapi connector
8 | sslmode: require
9 | another_option: 123
--------------------------------------------------------------------------------
/tests/resources/datamodel.csv:
--------------------------------------------------------------------------------
1 | table ,pk ,fk_table,fk_col ,fk_table_jointype
2 | xmltable ,_id , , ,
3 | carbank_xmltable,_secid ,xmltable,_id ,
4 | species_xmltable,col ,xmltable,_id ,
5 | red_xmltable ,MedlineID,xmltable,_id ,
6 | product ,id ,purchase,productid,LEFT
7 | purchase ,id , , ,
8 | store ,id ,purchase,storeid ,INNER
9 |
--------------------------------------------------------------------------------
/tests/resources/meta.csv:
--------------------------------------------------------------------------------
1 | key ,src_system,source_desc,src_database,src_table ,src_filetype,src_file_path ,src_table_description,src_col ,src_col_description,src_col_datatype,key_constraints,src_col_filter,src_col_aggregator,src_col_aggregator_filter,check_column,mode,udf ,udfarguments,target_database,src_table_order,target_col ,target_col_filter,target_col_aggregator,target_col_aggregator_filter,target_table ,target_file_path,target_col_datatype,access_limitation,nullable,comment
2 | carbank_xmltable-_sec_id , , , ,carbank_xmltable,tbl , , ,_sec_id , ,string , , , , , , , , , ,0 ,sec_id , , , ,transformxmltable, ,string , , ,
3 | xmltable-_id , , , ,xmltable ,xml , , ,_id , ,string , , , , , , , , , ,0 ,id , , , ,transformxmltable, ,string , , ,
4 | xmltable-_mtype , , , ,xmltable ,xml , , ,_mtype , ,string , , , , , , , , , ,0 ,mtype , , , ,transformxmltable, ,string , , ,
5 | xmltable-_seqlen , , , ,xmltable ,xml , , ,_seqlen , ,bigint , , , , , , , , , ,0 ,seqlen , , , ,transformxmltable, ,bigint , , ,
6 | species_xmltable-col , , , ,species_xmltable,tbl , , ,col , ,string , , , , , , , , , ,0 ,species , , , ,transformxmltable, ,string , , ,
7 | red_xmltable-MedlineID , , , ,red_xmltable ,tbl , , ,MedlineID , ,bigint , , , , , , , , , ,0 ,MedlineID , , , ,transformxmltable, ,bigint , , ,
8 | product-name , , , ,product ,csv ,resources/product.csv , ,name , ,string , , , , , , ,nvl ,- , ,0 ,name , , , ,invoice , ,string , , ,
9 | purchase-purchasedate , , , ,purchase ,csv ,resources/purchase.csv, ,purchasedate , ,string ,pk , , , , , ,nvl ,1/1/1900 , ,0 ,purchasedate , , , ,invoice , ,string , , ,
10 | store-name , , , ,store ,csv ,resources/store.csv , ,name , ,string , ,eq('Dadar') , , , , ,nvl ,- , ,0 ,storename , , , ,invoice , ,string , , ,
11 | product-name , , , ,product ,csv ,resources/product.csv , ,name , ,string , , , , , , ,nvl ,- , ,1 ,name , , , ,salesummary , ,string , , ,
12 | purchase-id , , , ,purchase ,csv ,resources/purchase.csv, ,id , ,string , , , , , , ,count, , ,1 ,totalsolditems , , , ,salesummary , ,string , , ,
13 | store-name , , , ,store ,csv ,resources/store.csv , ,name , ,string , , , , , , ,nvl ,- , ,1 ,storename , , , ,salesummary , ,string , , ,
14 | salesummary-storename , , , ,salesummary , , , ,storename , , , , , , , , , , , ,0 ,productname , , , ,salesummary , ,string , , ,
15 | salesummary-name , , , ,salesummary , , , ,name , , , , , , , , , , , ,0 ,totalsoldproducts, , , ,salesummary , ,string , , ,
16 | salesummary-totalsolditems, , , ,salesummary , , , ,totalsolditems, , , , , , , , , , , ,0 ,storename , , , ,salesummary , ,string , , ,
17 |
--------------------------------------------------------------------------------
/tests/resources/mock_dataframe.txt:
--------------------------------------------------------------------------------
1 | a,b,c
2 | 1,x,2011-01-01
3 | 2,y,2001-04-02
--------------------------------------------------------------------------------
/tests/resources/product.csv:
--------------------------------------------------------------------------------
1 | id,name ,price
2 | 1 ,Wrist Watch,10
3 | 2 ,Shoes ,8
4 | 3 ,Tshirt ,5
5 | 4 ,Jeans ,7
6 | 5 ,Sunglasses ,7
7 |
--------------------------------------------------------------------------------
/tests/resources/purchase.csv:
--------------------------------------------------------------------------------
1 | id ,productid,purchasedate,storeid
2 | 100,1 ,10/11/2019 ,1000
3 | 101,3 ,10/12/2019 ,1002
4 | 102,1 , ,1004
5 | 103,1 ,10/14/2019 ,1004
6 | 104,4 ,10/15/2019 ,1003
7 | 105,4 ,10/16/2019 ,1002
8 |
--------------------------------------------------------------------------------
/tests/resources/store.csv:
--------------------------------------------------------------------------------
1 | id ,name
2 | 1000,Borivili
3 | 1001,Kandivili
4 | 1002,Andheri
5 | 1003,Bandra
6 | 1004,Dadar
7 | 1005,Byculla
8 |
--------------------------------------------------------------------------------
/tests/test_comprehensive_logging.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import unittest
3 | import logging.config
4 |
5 | from com.hellofresh.utils.comprehensive_logging import init_logging
6 |
7 |
8 | class LoggingTestCases(unittest.TestCase):
9 | def test_init_logging(self):
10 | init_logging(job_name='Unit tests')
11 | logger = logging.getLogger('root')
12 | self.assertEqual(logger.level, 10)
13 |
14 |
15 | if __name__ == '__main__':
16 | unittest.main()
17 |
--------------------------------------------------------------------------------
/tests/test_data_quality.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import unittest
3 | from pathlib import Path
4 |
5 | from com.hellofresh.utils.data_quality import Rule, RuleExecutionResult, DataQuality
6 | from com.hellofresh.utils.helpers import get_project_root, read_json_get_dict
7 | from com.hellofresh.utils.spark import get_or_create_spark_session
8 |
9 |
10 | class DataQualityTestCases(unittest.TestCase):
11 | def test_Rule(self):
12 | rule_dict = {
13 | "rule_id": 1011,
14 | "name": "Primary / Natural Keys",
15 | "description": "Primary / Natural Keys should not have duplicates",
16 | "rule_type": "unique",
17 | "columns": [
18 | "name"
19 | ]
20 | }
21 | rule = Rule(**rule_dict)
22 | self.assertEqual(rule.rule_id, 1011)
23 | self.assertEqual(rule.name, "Primary / Natural Keys")
24 |
25 | def test_RuleExecutionResult(self):
26 | rule_dict = {
27 | "rule_id": 1011,
28 | "name": "Primary / Natural Keys",
29 | "description": "Primary / Natural Keys should not have duplicates",
30 | "rule_type": "unique",
31 | "columns": [
32 | "name"
33 | ]
34 | }
35 | rule = Rule(**rule_dict)
36 | result = RuleExecutionResult(rule, 'fail', 0, 0, 0)
37 | self.assertEqual(result.status, 'fail')
38 | self.assertEqual(result.rule, rule)
39 | self.assertEqual(result.rule.rule_type, 'unique')
40 |
41 | def test_data_quality(self):
42 | shutil.rmtree(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks")
43 | t1_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task1-dq-rules.json"
44 | dq = read_json_get_dict(json_path=t1_dq)
45 | dq['execution_reports_dir'] = f"{get_project_root()}/resources/data-quality-reports/recipe-tasks"
46 | dq_rules = DataQuality(**dq)
47 | spark = get_or_create_spark_session()
48 | df = spark.read.option('encoding', 'utf-8').json(f"{get_project_root()}/resources/data/input")
49 | execution_result = dq_rules.execute_rules(df=df)
50 | self.assertEqual(execution_result[0], False)
51 | self.assertTrue(execution_result[1].__contains__(''))
52 | dq_rules.write_report_to_html(file_name="task1-dq-report.html")
53 | self.assertTrue(
54 | Path(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks/task1-dq-report.html").is_file())
55 |
56 |
57 | if __name__ == '__main__':
58 | unittest.main()
59 |
--------------------------------------------------------------------------------
/tests/test_helpers.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import getpass
3 | import unittest
4 | from pathlib import Path
5 |
6 | from isodate import ISO8601Error
7 |
8 | from com.hellofresh.datapipelines.recipe_tasks import determine_cooking_difficulty
9 | from com.hellofresh.utils.comprehensive_logging import init_logging
10 | from com.hellofresh.utils.helpers import get_user, get_project_root, convert_iso_to_time_duration, \
11 | add_iso_time_duration
12 |
13 |
14 | class UtilsHelpersTestCases(unittest.TestCase):
15 | init_logging(job_name='UtilsHelpersTestCases')
16 |
17 | def test_get_user(self):
18 | user = get_user()
19 | self.assertEqual(user, getpass.getuser())
20 |
21 | def test_get_project_root(self):
22 | project_root_path: Path = get_project_root()
23 | self.assertEqual(project_root_path.name, 'vim89-data-engineering-test')
24 |
25 | def test_convert_iso_to_time_duration(self):
26 | try:
27 | convert_iso_to_time_duration("")
28 | except ValueError as v:
29 | self.assertEqual(v.__str__(), 'Empty or Invalid time duration string')
30 |
31 | try:
32 | convert_iso_to_time_duration("ABC")
33 | except ISO8601Error as i:
34 | self.assertEqual(i.__str__(), 'Error converting ISO time ABC to timedelta')
35 |
36 | iso_time = convert_iso_to_time_duration("PT100M")
37 | self.assertEqual(iso_time, datetime.timedelta(hours=1, minutes=40))
38 |
39 | iso_time = convert_iso_to_time_duration("PT")
40 | self.assertEqual(iso_time, datetime.timedelta(0))
41 |
42 | def test_add_iso_time_duration(self):
43 | try:
44 | add_iso_time_duration(time1="", time2="PT1H")
45 | except ValueError as v:
46 | self.assertEqual(v.__str__(), 'Empty or Invalid time duration string')
47 |
48 | iso_time = add_iso_time_duration(time1="PT100M", time2="PT1H")
49 | self.assertEqual(iso_time, "PT2H40M")
50 |
51 | iso_time = add_iso_time_duration(time1="PT", time2="PT5M")
52 | self.assertEqual(iso_time, "PT5M")
53 |
54 | iso_time = add_iso_time_duration(time1="PT", time2="PT")
55 | self.assertEqual(iso_time, "P0D")
56 |
57 | def test_determine_difficulty(self):
58 | difficulty = determine_cooking_difficulty(cook_time="PT", prep_time="PT")
59 | self.assertEqual(difficulty, ('P0D', 'easy'))
60 |
61 | difficulty = determine_cooking_difficulty(cook_time="PT21H", prep_time="PT")
62 | self.assertEqual(difficulty, ('PT21H', 'hard'))
63 |
64 | difficulty = determine_cooking_difficulty(cook_time="PT", prep_time="PT100M")
65 | self.assertEqual(difficulty, ('PT1H40M', 'hard'))
66 |
67 |
68 | if __name__ == '__main__':
69 | unittest.main()
70 |
--------------------------------------------------------------------------------
/tests/test_logging_util.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import unittest
3 | import logging.config
4 |
5 | from utils.Utilities import init_logging
6 | from utils.audit_util import audit_action
7 |
8 |
9 | class TestLoggingUtil(unittest.TestCase):
10 | def test_init_logging(self):
11 | init_logging(log_time_stamp=datetime.datetime.now())
12 | level20 = logging.getLogger('simpleExample').level
13 | self.assertEqual(level20, 20)
14 |
15 | def test_audit_action(self):
16 | @audit_action(action=f"testing Audit Action Wrapper")
17 | def audit_decorator():
18 | pass
19 |
20 |
21 | if __name__ == '__main__':
22 | unittest.main()
23 |
--------------------------------------------------------------------------------
/tests/test_recipe_tasks.py:
--------------------------------------------------------------------------------
1 | import shutil
2 | import unittest
3 | from pathlib import Path
4 |
5 | from com.hellofresh.datapipelines.recipe_tasks import main, task1, task2, determine_cooking_difficulty, \
6 | calculate_time_duration_average, standardize_and_rename_df_columns
7 | from com.hellofresh.utils.data_quality import DataQuality
8 | from com.hellofresh.utils.helpers import get_project_root, read_json_get_dict, convert_iso_to_time_duration
9 | from com.hellofresh.utils.spark import get_or_create_spark_session
10 |
11 |
12 | def del_dirs():
13 | try:
14 | shutil.rmtree(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks")
15 | shutil.rmtree(f"{get_project_root()}/resources/data/output/task1")
16 | shutil.rmtree(f"{get_project_root()}/resources/data/output/task2")
17 | except:
18 | pass
19 |
20 |
21 | class RecipeTasksTestCases(unittest.TestCase):
22 |
23 | @classmethod
24 | def setUpClass(self):
25 | self.args = {
26 | 'input_data_dir': f"{get_project_root()}/resources/data/input",
27 | 'output_data_dir': f"{get_project_root()}/resources/data/output"
28 | }
29 | del_dirs()
30 |
31 | @unittest.skip
32 | def test_main(self):
33 | del_dirs()
34 | t1_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task1-dq-rules.json"
35 | t2_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task2-dq-rules.json"
36 | main(self.args, t1_dq, t2_dq)
37 | self.assertTrue(
38 | Path(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks/task1-dq-report.html").is_file())
39 |
40 | def test_task1(self):
41 | del_dirs()
42 | t1_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task1-dq-rules.json"
43 | dq = read_json_get_dict(json_path=t1_dq)
44 | dq['execution_reports_dir'] = f"{get_project_root()}/resources/data-quality-reports/recipe-tasks"
45 | dq_rules = DataQuality(**dq)
46 | task1(input_data_path=self.args['input_data_dir'], input_file_type='json', dq_rules=dq_rules,
47 | output_data_path=f"{self.args['output_data_dir']}/task1", spark_opts={'encoding': 'utf-8'})
48 |
49 | self.spark = get_or_create_spark_session()
50 | df = self.spark.read.parquet(f"{self.args['output_data_dir']}/task1")
51 | self.assertEqual(df.count(), 1042)
52 | self.assertTrue(df.columns.__contains__('cook_time'))
53 | self.assertTrue(
54 | Path(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks/task1-dq-report.html").is_file())
55 |
56 | def test_task2(self):
57 | t2_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task2-dq-rules.json"
58 | dq = read_json_get_dict(json_path=t2_dq)
59 | dq['execution_reports_dir'] = f"{get_project_root()}/resources/data-quality-reports/recipe-tasks"
60 | dq_rules = DataQuality(**dq)
61 | task2(input_data_path=f"{self.args['output_data_dir']}/task1", input_file_type='parquet', dq_rules=dq_rules,
62 | output_data_path=f"{self.args['output_data_dir']}/task2")
63 |
64 | self.spark = get_or_create_spark_session()
65 | df = self.spark.read.csv(f"{self.args['output_data_dir']}/task2", header=True)
66 | self.assertEqual(df.count(), 3)
67 | self.assertTrue(df.columns.__contains__('avg_total_cooking_time'))
68 | self.assertTrue(
69 | Path(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks/task2-dq-report.html").is_file())
70 |
71 | def test_determine_cooking_difficulty(self):
72 | difficulty = determine_cooking_difficulty("PT1H", "PT2M")
73 | self.assertEqual(difficulty, ('PT1H2M', 'hard'))
74 | difficulty = determine_cooking_difficulty("PT5M", "PT15M")
75 | self.assertEqual(difficulty, ('PT20M', 'easy'))
76 | difficulty = determine_cooking_difficulty("PT15M", "PT20M")
77 | self.assertEqual(difficulty, ('PT35M', 'medium'))
78 | difficulty = determine_cooking_difficulty("PT", "PT")
79 | self.assertEqual(difficulty, ('P0D', 'easy'))
80 |
81 | try:
82 | difficulty = determine_cooking_difficulty("", "PT1H")
83 | print(difficulty)
84 | except Exception as ex:
85 | self.assertEqual(ex.__str__(), 'Expecting a string None')
86 |
87 | def test_calculate_time_duration_average(self):
88 | list_of_time_duration = list(map(lambda t: convert_iso_to_time_duration(t), ["PT1H", "PT30M", "PT", "PT2H5M"]))
89 | avg = calculate_time_duration_average(list_of_time_duration)
90 | self.assertEqual(avg, 'PT53M45S')
91 |
92 |
93 | if __name__ == '__main__':
94 | unittest.main()
95 |
--------------------------------------------------------------------------------
/tests/test_spark.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from pyspark.sql import SparkSession
4 |
5 | from com.hellofresh.utils.comprehensive_logging import init_logging
6 | from com.hellofresh.utils.spark import get_or_create_spark_session, standardize_and_rename_df_columns, \
7 | read_data_as_spark_dataframe, data_frame_repartition
8 |
9 |
10 | class UtilsSparkTestCases(unittest.TestCase):
11 | init_logging(job_name='UtilsSparkTestCases')
12 |
13 | def test_create_spark_session(self):
14 | spark: SparkSession = get_or_create_spark_session()
15 | self.assertIsNot(spark, None)
16 | self.assertEqual(spark, SparkSession.getActiveSession())
17 | self.assertEqual(spark.sparkContext.appName.__str__(), 'pyspark-shell')
18 |
19 | def test_standardize_and_rename_df_columns(self):
20 | spark = get_or_create_spark_session()
21 | data = [('Category A', 100, "This is category A"),
22 | ('Category B', 120, "This is category B"),
23 | ('Category C', 150, "This is category C")]
24 | df = spark.sparkContext.parallelize(data).toDF(['cateGory ', ' iD ', 'category description'])
25 |
26 | self.assertEqual(df.columns, ['cateGory ', ' iD ', 'category description'])
27 |
28 | df = standardize_and_rename_df_columns(df=df,
29 | column_names_to_rename={'category description': 'category_description'})
30 | self.assertEqual(df.columns, ['category', 'id', 'category_description'])
31 |
32 | def test_negative_cases_for_read_data_as_spark_dataframe(self):
33 |
34 | # INVALID
35 | try:
36 | df = read_data_as_spark_dataframe(filetype='invalid', location='a://a.txt')
37 | except Exception as ex:
38 | print(ex.__str__())
39 | self.assertEqual(ex.__str__(), 'Invalid filetype: invalid')
40 |
41 | # CSV
42 | try:
43 | df = read_data_as_spark_dataframe(filetype='csv', location='a://a.csv')
44 | csv_read = 'successful'
45 | except Exception as ex:
46 | csv_read = 'failed'
47 |
48 | self.assertAlmostEqual(csv_read, 'failed')
49 |
50 | # TEXT
51 | try:
52 | df = read_data_as_spark_dataframe(filetype='text', location='a://a.txt')
53 | text_read = 'successful'
54 | except Exception as ex:
55 | text_read = 'failed'
56 |
57 | self.assertAlmostEqual(text_read, 'failed')
58 |
59 | # XML
60 | try:
61 | df = read_data_as_spark_dataframe(filetype='xml', location='a://a.xml')
62 | xml_read = 'successful'
63 | except Exception as ex:
64 | xml_read = 'failed'
65 |
66 | self.assertAlmostEqual(xml_read, 'failed')
67 |
68 | # Table
69 | try:
70 | df = read_data_as_spark_dataframe(filetype='table', location='a://a.xml')
71 | table_read = 'successful'
72 | except Exception as ex:
73 | table_read = 'failed'
74 |
75 | self.assertAlmostEqual(table_read, 'failed')
76 |
77 | def test_data_frame_repartition(self):
78 | spark = get_or_create_spark_session()
79 | data = [('Category A', 100, "This is category A"),
80 | ('Category B', 120, "This is category B"),
81 | ('Category C', 150, "This is category C")]
82 | df = spark.sparkContext.parallelize(data).toDF(['category', 'id', 'category_description'])
83 |
84 | df = data_frame_repartition(df=df, use_coalesce=True, num_files=1)
85 | self.assertTrue(df is not None)
86 |
87 | df = data_frame_repartition(df=df, num_files=5, repartition_columns=['category'])
88 | self.assertFalse(df.columns.__contains__('temp_repartition_column'))
89 |
90 |
91 | if __name__ == '__main__':
92 | unittest.main()
93 |
--------------------------------------------------------------------------------
/tests/test_spark_submit_execution_pool.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | from datetime import datetime
3 |
4 | from utils.Utilities import init_logging, create_multiprocess_pool, execute_bash
5 |
6 |
7 | class TestSparkSubmitExecutionPool(unittest.TestCase):
8 | init_logging(datetime.now())
9 |
10 | def test_create_multiprocess_pool(self):
11 | bash_commands = [
12 | 'echo "cmd1"',
13 | 'echo "cmd2"',
14 | 'echo "cmd3"',
15 | 'hadoop version',
16 | 'echo "cmd5"',
17 | 'echo "cmd6"',
18 | 'echo "cmd7"',
19 | 'echo "cmd8"',
20 | 'echo "cmd9"',
21 | 'echo "cmd10"',
22 | 'echo "cmd11"'
23 | ]
24 | results, failures = create_multiprocess_pool(
25 | shared_data={'log_timestamp': datetime.now().isoformat().__str__()},
26 | command_list=bash_commands,
27 | sleep_time=0,
28 | max_parallel_jobs=6
29 | )
30 |
31 | bash_commands.append('spark-submit')
32 | _results, _failures = create_multiprocess_pool(
33 | shared_data={'log_timestamp': datetime.now().isoformat().__str__()},
34 | command_list=bash_commands,
35 | sleep_time=0,
36 | max_parallel_jobs=6
37 | )
38 |
39 | self.assertEqual(len(failures), 0)
40 | self.assertEqual(len(_failures) > 0, True)
41 |
42 | def test_execute_bash(self):
43 | pid, return_code, yarn_application_id, stdout, stderr = \
44 | execute_bash(shared_data={'log_timestamp': datetime.now().isoformat().__str__()},
45 | sleep_time=0, cmd='hadoop version')
46 |
47 | _pid, _return_code, _yarn_application_id, _stdout, _stderr = \
48 | execute_bash(shared_data={'log_timestamp': datetime.now().isoformat().__str__()},
49 | sleep_time=0, cmd='spark-submit')
50 | self.assertNotEqual(pid, None)
51 | self.assertNotEqual(stderr, None)
52 | self.assertNotEqual(stdout, None)
53 |
54 | self.assertEqual(len(yarn_application_id), 0)
55 | self.assertEqual(return_code == 0, True)
56 |
57 | self.assertNotEqual(_pid, None)
58 | self.assertNotEqual(_stderr, None)
59 | self.assertNotEqual(_stdout, None)
60 |
61 | self.assertEqual(len(_yarn_application_id), 0)
62 | self.assertEqual(_return_code > 0, True)
63 |
64 |
65 | if __name__ == '__main__':
66 | unittest.main()
67 |
--------------------------------------------------------------------------------
/tests/test_spark_submit_utils.py:
--------------------------------------------------------------------------------
1 | import datetime
2 | import logging
3 | import unittest
4 |
5 | from utils.Utilities import init_logging, cast_string_to_date, get_project_root, read_json_get_dict, read_yaml_get_dict
6 | from utils.spark_submit_utils import prepare_spark_submit
7 |
8 |
9 | class TestSparkSubmitUtils(unittest.TestCase):
10 | init_logging(datetime.datetime.now())
11 |
12 | def test_get_project_root(self):
13 | self.assertEqual(get_project_root().__str__(), '/Users/v0m02sj/PycharmProjects/datapipelines-essentials')
14 |
15 | def test_cast_string_to_date(self):
16 | dt = cast_string_to_date('2020-01-01', '%Y-%m-%d')
17 | _dt = cast_string_to_date('abcdefg', '%Y-%m-%d')
18 | self.assertEqual(type(dt), datetime.datetime)
19 | self.assertEqual(_dt, None)
20 |
21 | def test_prepare_spark_submit_command(self):
22 | application_properties = read_json_get_dict(
23 | json_path=f"{get_project_root()}/main/src/resources/config/application_properties.json")
24 | runtime_args = {} # parse_arguments(application_properties.get('command_line_args'))
25 | runtime_args.update({
26 | "workflow": "DVSkuDailyChannelWorkFlow",
27 | "refreshType": "history",
28 | "startDate": "2020-01-01",
29 | "endDate": "2020-01-10",
30 | "dq_enabled": "Y",
31 | "configFile": "/Users/v0m02sj/IdeaProjects/channel-perf-data-pipeline/configs/config-prod.yml"
32 | })
33 | static_args = read_yaml_get_dict(runtime_args['configFile'])
34 | runtime_args.update({'configFile': runtime_args['configFile'].split('/')[-1]})
35 | commands = prepare_spark_submit(runtime_args=runtime_args, config_args=static_args,
36 | app_config=application_properties)
37 | logging.debug(commands)
38 | self.assertEqual(len(commands) > 0, True)
39 |
40 |
41 | if __name__ == '__main__':
42 | unittest.main()
43 |
--------------------------------------------------------------------------------