├── .coveragerc ├── .gitignore ├── LICENSE ├── MANIFEST.in ├── README.md ├── conf ├── data-quality │ ├── example-dq-report.html │ └── rules │ │ ├── production_configs │ │ ├── recipe-task1-dq-rules.json │ │ └── recipe-task2-dq-rules.json │ │ └── unit_test_configs │ │ ├── recipe-task1-dq-rules.json │ │ └── recipe-task2-dq-rules.json ├── python │ └── logging-properties.json └── spark │ ├── log4j.properties │ └── sparkConf.conf ├── docs ├── APIDOC.MD ├── ETL_README.md ├── PysparkLocalSetup.docx ├── SETUP.MD ├── apidocumentation.html ├── images │ ├── DataQualityUML.png │ ├── XMLParse.png │ ├── dq-task1.png │ ├── dq-task2.png │ ├── task1_ouput_er.png │ └── task2_ouput_er.png └── setup.html ├── logs ├── bash │ └── logs └── python │ └── log-sample ├── requirements.txt ├── resources ├── data-quality-reports │ └── recipe-tasks │ │ ├── task1-dq-report.html │ │ └── task2-dq-report.html └── data │ ├── clinical_trial │ ├── data │ │ └── chunk1.zip │ ├── job_parameters │ │ └── clinical_trial.json │ ├── sql │ │ └── transformations │ │ │ └── sponsors.sql │ └── xml │ │ ├── clinical_study_xsd.xsd │ │ └── default_clinical_study.xml │ ├── config │ ├── application_properties.json │ ├── application_properties.yaml │ └── logging.yaml │ ├── product.csv │ ├── purchase.csv │ ├── recipes │ ├── input │ │ ├── recipes-000.json │ │ ├── recipes-001.json │ │ └── recipes-002.json │ └── output │ │ ├── task1 │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet.crc │ │ ├── _SUCCESS │ │ └── part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet │ │ └── task2 │ │ ├── ._SUCCESS.crc │ │ ├── .part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv.crc │ │ ├── _SUCCESS │ │ └── part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv │ └── store.csv ├── sbin ├── common_functions.sh ├── create_python_venv.sh └── execute-tasks-spark-submit.sh ├── setup.py ├── src └── com │ ├── __init__.py │ └── vitthalmirji │ ├── __init__.py │ ├── datapipelines │ ├── __init__.py │ ├── clinical_trial │ │ ├── __init__.py │ │ └── clinical_trial_etl.py │ └── recipe_tasks.py │ ├── datawarehousing │ ├── __init__.py │ └── change_data_capture.py │ ├── etl │ ├── CColumn.py │ ├── ETL.py │ ├── ETLTransform.py │ ├── ITable.py │ ├── __init__.py │ └── meta │ │ ├── MetaModel.py │ │ └── __init__.py │ ├── imports │ ├── HdfsImport.py │ └── __init__.py │ ├── kafka │ ├── Logger.py │ └── __init__.py │ ├── main.py │ ├── mapper │ ├── Mapper.py │ └── __init__.py │ ├── objects │ ├── __init__.py │ └── enums │ │ ├── Environments.py │ │ ├── Zones.py │ │ └── __init__.py │ └── utils │ ├── MockupData.py │ ├── Utilities.py │ ├── __init__.py │ ├── audit_util.py │ ├── comprehensive_logging.py │ ├── constants.py │ ├── data_quality.py │ ├── helpers.py │ ├── logging_util.py │ ├── spark.py │ ├── spark_submit_utils.py │ └── transformation_extension.py └── tests ├── EtlTransformTest.py ├── UtilsTest.py ├── XmlMapperTest.py ├── aws_test ├── AwsS3Test.py ├── __init__.py ├── glue_job.py ├── test_glue_job.py ├── test_mocked_postgres.py ├── test_mocked_redshift.py ├── test_mocked_redshift_infra.py └── testing_mocked_s3.py ├── resources ├── config.yml ├── datamodel.csv ├── meta.csv ├── mock_dataframe.txt ├── product.csv ├── purchase.csv └── store.csv ├── test_comprehensive_logging.py ├── test_data_quality.py ├── test_helpers.py ├── test_logging_util.py ├── test_recipe_tasks.py ├── test_spark.py ├── test_spark_submit_execution_pool.py └── test_spark_submit_utils.py /.coveragerc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/.coveragerc -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ### Python template 2 | 3 | # Bash script logs 4 | *.log 5 | 6 | # Byte-compiled / optimized / DLL files 7 | __pycache__/ 8 | *.py[cod] 9 | *$py.class 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | pip-wheel-metadata/ 26 | share/python-wheels/ 27 | *.egg-info/ 28 | .installed.cfg 29 | *.egg 30 | MANIFEST 31 | src/test/metastore_db 32 | src/src/main/test/hive 33 | src/test/spark-warehouse 34 | src/test/derby.log 35 | 36 | # PyInstaller 37 | # Usually these files are written by a python script from a template 38 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 39 | *.manifest 40 | *.spec 41 | 42 | # Installer logs 43 | pip-log.txt 44 | pip-delete-this-directory.txt 45 | 46 | # Unit test / coverage reports 47 | htmlcov/ 48 | .tox/ 49 | .nox/ 50 | .coverage 51 | .coverage.* 52 | .cache 53 | nosetests.xml 54 | coverage.xml 55 | *.cover 56 | .hypothesis/ 57 | .pytest_cache/ 58 | 59 | # Translations 60 | *.mo 61 | *.pot 62 | 63 | # PyBuilder 64 | target/ 65 | 66 | # pyenv 67 | .python-version 68 | 69 | # Environments 70 | .env 71 | .venv 72 | env/ 73 | venv/ 74 | ENV/ 75 | env.bak/ 76 | venv.bak/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 
34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 
202 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include sbin *.sh 2 | recursive-include conf *.html *.json *.conf *.properties *.html 3 | recursive-include resources *.html 4 | recursive-include logs logs log-sample 5 | recursive-include docs *.md 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Datalake ETL Pipeline 2 | Data transformation simplified for any Data platform. 3 | 4 | `Features:` The package has complete ETL process - 5 | 1. Uses metadata, transformation & data model information to design ETL pipeline 6 | 2. Builds target transformation SparkSQL and Spark Dataframes 7 | 3. Builds source & target Hive DDLs 8 | 4. Validates DataFrames, extends core classes, defines DataFrame transformations, and provides UDF SQL functions. 9 | 5. Supports below fundamental transformations for ETL pipeline - 10 | * Filters on source & target dataframes 11 | * Grouping and Aggregations on source & target dataframes 12 | * Heavily nested queries / dataframes 13 | 6. Has complex and heavily nested XML, JSON, Parquet & ORC parser to nth 14 | level of nesting 15 | 7. Has Unit test cases designed on function/method level & measures 16 | source code coverage 17 | 8. Has information about delpoying to higher environments 18 | 9. Has API documentation for customization & enhancement 19 | 20 | `Enhancements:` In progress - 21 | 1. Integrate Audit and logging - Define Error codes, log process 22 | failures, Audit progress & runtime information -------------------------------------------------------------------------------- /conf/data-quality/example-dq-report.html: -------------------------------------------------------------------------------- 1 |

Team,

Data Quality check finished successfully for DQ ID = 101, with failures. Check details in below table of metrics.

Failed DQ details

Yarn Application Id DQ ID Rule ID Rule Name Rule type Description Columns/Query Pass Count Fail Count Total Count
local-1681916910001 101 1011 Primary / Natural Keys unique Primary / Natural Keys should not have duplicates ['name'] 1039 3 1042
local-1681916910001 101 1012 NOT NULL fields not null Field should have valid value ['name', 'cookTime', 'prepTime'] 715 327 1042

Succeeded DQ details

Yarn Application Id DQ ID Rule ID Rule Name Rule type Description Columns/Query Pass Count Fail Count Total Count
local-1681916910001 101 1013 File names check query Check If all input files are read for processing ["WITH file_names AS (SELECT 'recipes-000.json' AS file_name UNION SELECT 'recipes-001.json' AS file_name UNION SELECT 'recipes-002.json' AS file_name)\nSELECT f.file_name FROM file_names f\nLEFT JOIN (SELECT DISTINCT reverse(split(input_file_name(), '/'))[0] as file_name FROM temp) t\nON t.file_name = f.file_name\nWHERE t.file_name IS NULL"] 1042 0 1042


Thanks
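
The rule configuration files that follow drive reports like the one above: each rule is typed as `unique`, `not null`, or `query`, and the framework reports pass/fail/total row counts per rule. As a rough, hedged sketch (not the repository's actual `src/com/vitthalmirji/utils/data_quality.py`; the helper names here are hypothetical), the first two rule types could be evaluated with PySpark roughly like this:

```python
# Illustrative only: shows how "unique" and "not null" rules could yield the
# Pass/Fail/Total counts seen in the report above. Function names are hypothetical.
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def check_unique(df: DataFrame, columns: list) -> dict:
    """Count rows whose key combination occurs more than once as failures."""
    total = df.count()
    dupes = (df.groupBy(*columns).count()                       # occurrences per key
               .filter(F.col("count") > 1)                      # keep duplicated keys
               .agg(F.coalesce(F.sum("count"), F.lit(0)).alias("dupes"))
               .collect()[0]["dupes"])
    return {"pass_count": total - dupes, "fail_count": dupes, "total_count": total}


def check_not_null(df: DataFrame, columns: list) -> dict:
    """Count rows having NULL in any of the listed columns as failures."""
    total = df.count()
    condition = None
    for c in columns:                                            # OR the null checks together
        condition = F.col(c).isNull() if condition is None else (condition | F.col(c).isNull())
    failed = df.filter(condition).count()
    return {"pass_count": total - failed, "fail_count": failed, "total_count": total}
```

A `query` rule (such as rule 1013 above) can then be handled by running the configured SQL against the registered temp view and treating every returned row as a failure; zero rows returned means the check passed.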
-------------------------------------------------------------------------------- /conf/data-quality/rules/production_configs/recipe-task1-dq-rules.json: -------------------------------------------------------------------------------- 1 | { 2 | "dq_id": 101, 3 | "rules": [ 4 | { 5 | "rule_id": 1011, 6 | "name": "Primary / Natural Keys", 7 | "description": "Primary / Natural Keys should not have duplicates", 8 | "rule_type": "unique", 9 | "columns": [ 10 | "name" 11 | ] 12 | }, 13 | { 14 | "rule_id": 1012, 15 | "name": "NOT NULL fields", 16 | "description": "Field should have valid value", 17 | "rule_type": "not null", 18 | "columns": [ 19 | "name", 20 | "cookTime", 21 | "prepTime" 22 | ] 23 | }, 24 | { 25 | "rule_id": 1013, 26 | "name": "Input files check", 27 | "description": "Check If all input files are read for processing", 28 | "rule_type": "query", 29 | "query": "WITH file_names AS (SELECT 'recipes-000.json' AS file_name UNION SELECT 'recipes-001.json' AS file_name UNION SELECT 'recipes-002.json' AS file_name)\nSELECT f.file_name FROM file_names f\nLEFT JOIN (SELECT DISTINCT reverse(split(input_file_name(), '/'))[0] as file_name FROM temp) t\nON t.file_name = f.file_name\nWHERE t.file_name IS NULL" 30 | }, 31 | { 32 | "rule_id": 1014, 33 | "name": "\"Check for invalid cook & prep time", 34 | "description": "Check empty or null values", 35 | "rule_type": "query", 36 | "query": "SELECT * FROM temp WHERE cookTime = '' OR prepTime = ''" 37 | } 38 | ] 39 | } 40 | -------------------------------------------------------------------------------- /conf/data-quality/rules/production_configs/recipe-task2-dq-rules.json: -------------------------------------------------------------------------------- 1 | { 2 | "dq_id": 101, 3 | "rules": [ 4 | { 5 | "rule_id": 1015, 6 | "name": "Primary / Natural Keys", 7 | "description": "Primary / Natural Keys should not have duplicates", 8 | "rule_type": "unique", 9 | "columns": [ 10 | "difficulty" 11 | ] 12 | }, 13 | { 14 | "rule_id": 1016, 15 | "name": "NOT NULL fields", 16 | "description": "Field should have valid value", 17 | "rule_type": "not null", 18 | "columns": [ 19 | "difficulty", 20 | "avg_total_cooking_time" 21 | ] 22 | } 23 | ] 24 | } 25 | -------------------------------------------------------------------------------- /conf/data-quality/rules/unit_test_configs/recipe-task1-dq-rules.json: -------------------------------------------------------------------------------- 1 | { 2 | "dq_id": 101, 3 | "execution_reports_dir": "/resources/data-quality-reports/recipe-tasks", 4 | "email_execution_report_to": "vitthalmirji@gmail.com", 5 | "rules": [ 6 | { 7 | "rule_id": 1011, 8 | "name": "Primary / Natural Keys", 9 | "description": "Primary / Natural Keys should not have duplicates", 10 | "rule_type": "unique", 11 | "columns": [ 12 | "name" 13 | ] 14 | }, 15 | { 16 | "rule_id": 1012, 17 | "name": "NOT NULL fields", 18 | "description": "Field should have valid value", 19 | "rule_type": "not null", 20 | "columns": [ 21 | "name", 22 | "cookTime", 23 | "prepTime" 24 | ] 25 | }, 26 | { 27 | "rule_id": 1013, 28 | "name": "Input files check", 29 | "description": "Check If all input files are read for processing", 30 | "rule_type": "query", 31 | "query": "WITH file_names AS (SELECT 'recipes-000.json' AS file_name UNION SELECT 'recipes-001.json' AS file_name UNION SELECT 'recipes-002.json' AS file_name)\nSELECT f.file_name FROM file_names f\nLEFT JOIN (SELECT DISTINCT reverse(split(input_file_name(), '/'))[0] as file_name FROM temp) t\nON t.file_name = 
f.file_name\nWHERE t.file_name IS NULL" 32 | }, 33 | { 34 | "rule_id": 1014, 35 | "name": "\"Check for invalid cook & prep time", 36 | "description": "Check empty or null values", 37 | "rule_type": "query", 38 | "query": "SELECT * FROM temp WHERE cookTime = '' OR prepTime = ''" 39 | } 40 | ] 41 | } 42 | -------------------------------------------------------------------------------- /conf/data-quality/rules/unit_test_configs/recipe-task2-dq-rules.json: -------------------------------------------------------------------------------- 1 | { 2 | "dq_id": 101, 3 | "execution_reports_dir": "/resources/data-quality-reports/recipe-tasks", 4 | "email_execution_report_to": "vitthalmirji@gmail.com", 5 | "rules": [ 6 | { 7 | "rule_id": 1015, 8 | "name": "Primary / Natural Keys", 9 | "description": "Primary / Natural Keys should not have duplicates", 10 | "rule_type": "unique", 11 | "columns": [ 12 | "difficulty" 13 | ] 14 | }, 15 | { 16 | "rule_id": 1016, 17 | "name": "NOT NULL fields", 18 | "description": "Field should have valid value", 19 | "rule_type": "not null", 20 | "columns": [ 21 | "difficulty", 22 | "avg_total_cooking_time" 23 | ] 24 | } 25 | ] 26 | } 27 | -------------------------------------------------------------------------------- /conf/python/logging-properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "objects": { 4 | "queue": { 5 | "class": "queue.Queue", 6 | "maxsize": 1000 7 | } 8 | }, 9 | "formatters": { 10 | "simple": { 11 | "format": "%(asctime)s - %(name)s - %(levelname)s - %(message)s" 12 | }, 13 | "detailed": { 14 | "format": "%(asctime)s %(name)-15s %(levelname)-8s %(process)-10d %(funcName)-30s %(message)s" 15 | } 16 | }, 17 | "handlers": { 18 | "console": { 19 | "class": "logging.StreamHandler", 20 | "level": "DEBUG", 21 | "formatter": "detailed", 22 | "stream": "ext://sys.stdout" 23 | }, 24 | "file": { 25 | "class": "logging.FileHandler", 26 | "level": "DEBUG", 27 | "encoding": "utf-8", 28 | "formatter": "detailed", 29 | "filename": "logs/log-{job_name_placeholder}_{timestamp_placeholder}.log", 30 | "mode": "a" 31 | } 32 | }, 33 | "loggers": { 34 | "simple": { 35 | "level": "INFO", 36 | "handlers": [ 37 | "console" 38 | ], 39 | "propagate": "no" 40 | }, 41 | "unit-tests": { 42 | "level": "DEBUG", 43 | "handlers": [ 44 | "console" 45 | ], 46 | "propagate": "no" 47 | } 48 | }, 49 | "root": { 50 | "level": "DEBUG", 51 | "handlers": [ 52 | "console", 53 | "file" 54 | ] 55 | } 56 | } 57 | -------------------------------------------------------------------------------- /conf/spark/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | #Global logging 3 | log4j.rootCategory=WARN, console 4 | log4j.appender.console=org.apache.log4j.ConsoleAppender 5 | log4j.appender.console.target=System.err 6 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 7 | log4j.appender.console.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 8 | 9 | # Spark 3.x 10 | log4j.logger.org.sparkproject.jetty.server.handler.ContextHandler=WARN 11 | 12 | # Spark 2.x 13 | log4j.logger.org.spark_project.jetty.server.handler.ContextHandler=WARN 14 | 15 | # Send WARN or higher to stderr 16 | log4j.appender.stderr=org.apache.log4j.ConsoleAppender 17 | log4j.appender.stderr.Threshold=WARN 18 | log4j.appender.stderr.Target=System.err 19 | log4j.appender.stderr.layout=org.apache.log4j.PatternLayout 20 | 
log4j.appender.stderr.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 21 | 22 | # Parquet related logging 23 | log4j.logger.parquet.name = org.apache.parquet.CorruptStatistics 24 | log4j.logger.parquet.level = WARN 25 | log4j.logger.parquet2.name = parquet.CorruptStatistics 26 | log4j.logger.parquet2.level = WARN 27 | 28 | # Hive metastore related logging 29 | logger.metastore.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler 30 | logger.metastore.level = FATAL 31 | logger.hive_functionregistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry 32 | logger.hive_functionregistry.level = ERROR 33 | 34 | # Settings to quiet third party logs that are too verbose 35 | log4j.logger.org.eclipse.jetty=WARN 36 | log4j.logger.org.eclipse.jetty.util.component.AbstractLifeCycle=ERROR 37 | log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO 38 | log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO 39 | 40 | # Reduce verbosity for other spammy core classes. 41 | log4j.logger.org.apache.spark=WARN 42 | log4j.logger.org.apache.spark.util=ERROR 43 | log4j.logger.org.apache.spark.network=WARN 44 | log4j.logger.akka=WARN 45 | log4j.logger.org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter=WARN 46 | log4j.logger.org.apache.hadoop.conf.Configuration.deprecation=WARN 47 | 48 | # Hello Fresh com.vitthalmirji logging into separate file 49 | log4j.logger.com.vitthalmirji=INFO, vimAppender 50 | log4j.additivity.com.vitthalmirji=false 51 | log4j.appender.vimAppender=org.apache.log4j.FileAppender 52 | log4j.appender.vimAppender.File=${spark.yarn.app.container.log.dir}/stdout 53 | log4j.appender.vimAppender.layout=org.apache.log4j.PatternLayout 54 | log4j.appender.vimAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 55 | 56 | # Spark Bigquery logging into separate file 57 | log4j.logger.com.google.cloud.spark.bigquery=INFO, sparkbigqueryAppender 58 | log4j.additivity.com.google.cloud.spark.bigquery=false 59 | log4j.appender.sparkbigqueryAppender=org.apache.log4j.FileAppender 60 | log4j.appender.sparkbigqueryAppender.File=${spark.yarn.app.container.log.dir}/spark-big-query.log 61 | log4j.appender.sparkbigqueryAppender.layout=org.apache.log4j.PatternLayout 62 | log4j.appender.sparkbigqueryAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 63 | 64 | # Bigquery logging into separate file 65 | log4j.logger.com.google.cloud.bigquery=INFO, bigqueryAppender 66 | log4j.additivity.com.google.cloud.bigquery=false 67 | log4j.appender.bigqueryAppender=org.apache.log4j.FileAppender 68 | log4j.appender.bigqueryAppender.File=${spark.yarn.app.container.log.dir}/big-query.log 69 | log4j.appender.bigqueryAppender.layout=org.apache.log4j.PatternLayout 70 | log4j.appender.bigqueryAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 71 | 72 | # Hudi logging into separate file 73 | log4j.logger.org.apache.hudi=INFO, hudiAppender 74 | log4j.additivity.org.apache.hudi=false 75 | log4j.appender.hudiAppender=org.apache.log4j.FileAppender 76 | log4j.appender.hudiAppender.File=${spark.yarn.app.container.log.dir}/hudi.log 77 | log4j.appender.hudiAppender.layout=org.apache.log4j.PatternLayout 78 | log4j.appender.hudiAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 79 | 80 | # Cosmos logging into separate file 81 | log4j.logger.com.microsoft.azure.cosmosdb=INFO, cosmosdbAppender 82 | log4j.additivity.com.microsoft.azure.cosmosdb=false 83 | log4j.appender.cosmosdbAppender=org.apache.log4j.FileAppender 84 | 
log4j.appender.cosmosdbAppender.File=${spark.yarn.app.container.log.dir}/cosmosdb.log 85 | log4j.appender.cosmosdbAppender.layout=org.apache.log4j.PatternLayout 86 | log4j.appender.cosmosdbAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 87 | 88 | # GCS logging into separate file 89 | log4j.logger.com.google.cloud.storage=INFO, gcsAppender 90 | log4j.additivity.com.google.cloud.storage=false 91 | log4j.appender.gcsAppender=org.apache.log4j.FileAppender 92 | log4j.appender.gcsAppender.File=${spark.yarn.app.container.log.dir}/gcs.log 93 | log4j.appender.gcsAppender.layout=org.apache.log4j.PatternLayout 94 | log4j.appender.gcsAppender.layout.ConversionPattern=%d{yy-MM-dd HH:mm:ss} %p %c{1}: %m%n 95 | -------------------------------------------------------------------------------- /conf/spark/sparkConf.conf: -------------------------------------------------------------------------------- 1 | GLOBAL { 2 | "master" = "yarn" 3 | "hive.exec.dynamic.partition.mode" = "nonstrict" 4 | "hive.exec.dynamic.partition" = "true" 5 | "spark.sql.sources.partitionOverwriteMode" = "dynamic" 6 | "mapreduce.fileoutputcommitter.algorithm.version" = "2" 7 | "parquet.enable.summary-metadata" = "false" 8 | "parquet.compression" = "snappy" 9 | "spark.sql.parquet.mergeSchema" = "false" 10 | "spark.sql.parquet.filterPushdown" = "true" 11 | "spark.sql.hive.metastorePartitionPruning" = "true" 12 | "spark.sql.orc.filterPushdown" = "true" 13 | "spark.sql.orc.splits.include.file.footer" = "true" 14 | "spark.sql.orc.cache.stripe.details.size" = "10000" 15 | "spark.sql.broadcastTimeout" = "1800" 16 | } 17 | 18 | LOCAL { 19 | "master" = "local[*]" 20 | "spark.hadoop.hive.exec.dynamic.partition.mode" = "nonstrict" 21 | "spark.hadoop.hive.exec.dynamic.partition" = "true" 22 | "spark.sql.sources.partitionOverwriteMode" = "dynamic" 23 | "spark.executor.instances" = "1" 24 | } 25 | -------------------------------------------------------------------------------- /docs/APIDOC.MD: -------------------------------------------------------------------------------- 1 | # Datalake ETL Pipeline 2 | Data transformation simplified for any Data platform. 3 | 4 | `Features:` The package has complete ETL process - 5 | 1. Uses metadata, transformation & data model information to design ETL pipeline 6 | 2. Builds target transformation SparkSQL and Spark Dataframes 7 | 3. Builds source & target Hive DDLs 8 | 4. Validates DataFrames, extends core classes, defines DataFrame transformations, and provides UDF SQL functions. 9 | 5. Supports below fundamental transformations for ETL pipeline - 10 | * Filters on source & target dataframes 11 | * Grouping and Aggregations on source & target dataframes 12 | * Heavily nested queries / dataframes 13 | 6. Has complex and heavily nested XML, JSON, Parquet & ORC parser to nth 14 | level of nesting 15 | 7. Has Unit test cases designed on function/method level & measures 16 | source code coverage 17 | 8. Has information about delpoying to higher environments 18 | 9. Has API documentation for customization & enhancement 19 | 20 | `Enhancements:` In progress - 21 | 1. Integrate Audit and logging - Define Error codes, log process 22 | failures, Audit progress & runtime information 23 | 24 | # Datalake ETL Pipeline API documentation 25 | ## Mappers for complex/nested data sources 26 | * Has interface `IMapper` and implemented concrete class `XmlMapper`. We 27 | can use same abstract / interface for other category of file mapping 28 | viz. XML/JSON/Parquet/ORC. 
29 | * Core methods/function common for overriding 30 | are – `getDataframeSchema`, `createDDL`, `complexTypeIterator`, 31 | `handleStructType`, `handleArrayType` 32 | 33 | * Overview of complex type parsing & exploding - 34 | * ![Complex type parser](images/XMLParse.png) 35 | ``` 36 | def handleStructType(self, viewname, viewpath, database, table, xpath, level, dtype, acc={}, xpaths=[]) 37 | ``` 38 | ``` 39 | def handleArrayType(self, viewname, viewpath, database, table, xpath, level, dtype: ArrayType, acc={}, xpaths=[]) 40 | ``` 41 | ``` 42 | def complexTypeIterator(self, viewname, viewpath, database, table, xpath, level, dtype: DataType, acc={}, xpaths=[]) 43 | ``` 44 | 45 | ### XmlMapper 46 | * `XmlMapper` specific methods / functions – `createViewsAndXpaths`, 47 | `buildXmlSerdeDDL` 48 | 49 | ``` 50 | def createViewsAndXpaths(self, df: DataFrame, database, table) 51 | ``` 52 | ``` 53 | def buildXmlSerdeDdl(self, database, table, xmlsourcelocation, xmlrowstarttag, xmlrowendtag) 54 | ``` 55 | 56 | ## Pyspark Core Class Extensions 57 | 58 | ``` 59 | from etl.meta import * 60 | ``` 61 | 62 | ### Column Extensions 63 | 64 | **isFalsy()** 65 | 66 | ```python 67 | source_df.withColumn("is_stuff_falsy", F.col("has_stuff").isFalsy()) 68 | ``` 69 | 70 | Returns `True` if `has_stuff` is `None` or `False`. 71 | 72 | **isTruthy()** 73 | 74 | ```python 75 | source_df.withColumn("is_stuff_truthy", F.col("has_stuff").isTruthy()) 76 | ``` 77 | 78 | Returns `True` unless `has_stuff` is `None` or `False`. 79 | 80 | **isNullOrBlank()** 81 | 82 | ```python 83 | source_df.withColumn("is_blah_null_or_blank", F.col("blah").isNullOrBlank()) 84 | ``` 85 | 86 | Returns `True` if `blah` is `null` or blank (the empty string or a string that only contains whitespace). 87 | 88 | **isNotIn()** 89 | 90 | ```python 91 | source_df.withColumn("is_not_bobs_hobby", F.col("fun_thing").isNotIn(bobs_hobbies)) 92 | ``` 93 | 94 | Returns `True` if `fun_thing` is not included in the `bobs_hobbies` list. 95 | 96 | **nullBetween()** 97 | 98 | ```python 99 | source_df.withColumn("is_between", F.col("age").nullBetween(F.col("lower_age"), F.col("upper_age"))) 100 | ``` 101 | 102 | Returns `True` if `age` is between `lower_age` and `upper_age`. If `lower_age` is populated and `upper_age` is `null`, it will return `True` if `age` is greater than or equal to `lower_age`. If `lower_age` is `null` and `upper_age` is populate, it will return `True` if `age` is lower than or equal to `upper_age`. 103 | 104 | ### SparkSession Extensions 105 | 106 | **create_df()** 107 | 108 | ```python 109 | spark.create_df( 110 | [("jose", "a"), ("li", "b"), ("sam", "c")], 111 | [("name", StringType(), True), ("blah", StringType(), True)] 112 | ) 113 | ``` 114 | 115 | Creates DataFrame with a syntax that's less verbose than the built-in `createDataFrame` method. 116 | 117 | ### DataFrame Extensions 118 | 119 | **applyTransform()** 120 | 121 | ```python 122 | source_df\ 123 | .applyTransform(lambda df: with_greeting(df))\ 124 | .applyTransform(lambda df: with_something(df, "crazy")) 125 | ``` 126 | 127 | Allows for multiple DataFrame transformations to be run and executed. 128 | 129 | ## Helper Functions 130 | 131 | ```python 132 | 133 | import etl 134 | ``` 135 | 136 | ### DataFrame Validations 137 | 138 | **validatePresenceOfColumns()** 139 | 140 | ```python 141 | etl.meta.validatePresenceOfColumns(source_df, ["name", "age", "fun"]) 142 | ``` 143 | 144 | Raises an exception unless `source_df` contains the `name`, `age`, and `fun` column. 
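
A validation helper of this shape boils down to a few lines; the sketch below only illustrates the idea and is not necessarily the packaged `etl.meta` implementation:

```python
def validatePresenceOfColumns(df, required_col_names):
    # Compare required names against the DataFrame's actual columns and
    # fail fast with a descriptive message when any are missing.
    missing = [c for c in required_col_names if c not in df.columns]
    if missing:
        raise ValueError(
            f"DataFrame is missing required columns {missing}; "
            f"available columns are {df.columns}"
        )
```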
145 | 146 | **validateSchema()** 147 | 148 | ```python 149 | etl.meta.validateSchema(source_df, required_schema) 150 | ``` 151 | 152 | Raises an exception unless `source_df` contains all the `StructFields` defined in the `required_schema`. 153 | 154 | **validateAbsenseOfColumns()** 155 | 156 | ```python 157 | etl.meta.validateAbsenseOfColumns(source_df, ["age", "cool"]) 158 | ``` 159 | 160 | Raises an exception if `source_df` contains `age` or `cool` columns. 161 | 162 | ### Functions 163 | 164 | **single_space()** 165 | 166 | ```python 167 | actual_df = source_df.withColumn( 168 | "words_single_spaced", 169 | etl.meta.single_space(col("words")) 170 | ) 171 | ``` 172 | 173 | 174 | Replaces all multispaces with single spaces (e.g. changes `"this has some"` to `"this has some"`. 175 | 176 | **remove_all_whitespace()** 177 | 178 | ```python 179 | actual_df = source_df.withColumn( 180 | "words_without_whitespace", 181 | etl.meta.remove_all_whitespace(col("words")) 182 | ) 183 | ``` 184 | 185 | Removes all whitespace in a string (e.g. changes `"this has some"` to `"thishassome"`. 186 | 187 | **anti_trim()** 188 | 189 | ```python 190 | actual_df = source_df.withColumn( 191 | "words_anti_trimmed", 192 | etl.meta.anti_trim(col("words")) 193 | ) 194 | ``` 195 | 196 | Removes all inner whitespace, but doesn't delete leading or trailing whitespace (e.g. changes `" this has some "` to `" thishassome "`. 197 | 198 | **remove_non_word_characters()** 199 | 200 | ```python 201 | actual_df = source_df.withColumn( 202 | "words_without_nonword_chars", 203 | etl.meta.remove_non_word_characters(col("words")) 204 | ) 205 | ``` 206 | 207 | Removes all non-word characters from a string (e.g. changes `"si%$#@!#$!@#mpsons"` to `"simpsons"`. 208 | 209 | **exists()** 210 | 211 | ```python 212 | source_df.withColumn( 213 | "any_num_greater_than_5", 214 | etl.meta.exists(lambda n: n > 5)(col("nums")) 215 | ) 216 | ``` 217 | 218 | `nums` contains lists of numbers and `exists()` returns `True` if any of the numbers in the list are greater than 5. It's similar to the Python `any` function. 219 | 220 | **forall()** 221 | 222 | ```python 223 | source_df.withColumn( 224 | "all_nums_greater_than_3", 225 | etl.meta.forall(lambda n: n > 3)(col("nums")) 226 | ) 227 | ``` 228 | 229 | `nums` contains lists of numbers and `forall()` returns `True` if all of the numbers in the list are greater than 3. It's similar to the Python `all` function. 230 | 231 | **multi_equals()** 232 | 233 | ```python 234 | source_df.withColumn( 235 | "are_s1_and_s2_cat", 236 | etl.meta.multi_equals("cat")(col("s1"), col("s2")) 237 | ) 238 | ``` 239 | 240 | `multi_equals` returns true if `s1` and `s2` are both equal to `"cat"`. 241 | 242 | ### Transformations 243 | 244 | **snakeCaseColumnNames()** 245 | 246 | ```python 247 | etl.meta.snakeCaseColumnNames(source_df) 248 | ``` 249 | 250 | Converts all the column names in a DataFrame to snake_case. It's annoying to write SQL queries when columns aren't snake cased. 251 | 252 | **sort_columns()** 253 | 254 | ```python 255 | etl.meta.sort_columns(source_df, "asc") 256 | ``` 257 | 258 | Sorts the DataFrame columns in alphabetical order. Wide DataFrames are easier to navigate when they're sorted alphabetically. 259 | 260 | ### DataFrame Helpers 261 | 262 | **columnToList()** 263 | 264 | ```python 265 | etl.meta.columnToList(source_df, "name") 266 | ``` 267 | 268 | Converts a column in a DataFrame to a list of values. 
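
For reference, a helper like `columnToList` is typically just a `select` followed by `collect`; the version below is a sketch and may differ from the shipped implementation:

```python
def columnToList(df, col_name):
    # Collect a single column to the driver and unwrap each Row into its value.
    return [row[col_name] for row in df.select(col_name).collect()]
```

Because `collect()` pulls data to the driver, this is intended for small, lookup-style columns rather than large datasets.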
269 | 270 | **twoColumns2Dictionary()** 271 | 272 | ```python 273 | etl.meta.twoColumns2Dictionary(source_df, "name", "age") 274 | ``` 275 | 276 | Converts two columns of a DataFrame into a dictionary. In this example, `name` is the key and `age` is the value. 277 | 278 | **toListOfDictionaries()** 279 | 280 | ```python 281 | etl.meta.toListOfDictionaries(source_df) 282 | ``` 283 | Converts an entire DataFrame into a list of dictionaries. 284 | -------------------------------------------------------------------------------- /docs/PysparkLocalSetup.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/PysparkLocalSetup.docx -------------------------------------------------------------------------------- /docs/SETUP.MD: -------------------------------------------------------------------------------- 1 | # Datalake ETL Pipeline 2 | Data transformation simplified for any Data platform. 3 | 4 | `Features:` The package has complete ETL process - 5 | 1. Uses metadata, transformation & data model information to design ETL pipeline 6 | 2. Builds target transformation using both SparkSQL and Spark Dataframes 7 | \- Developer to choose option 8 | 3. Builds source & target Hive DDLs 9 | 4. Validates DataFrames, extends core classes, defines DataFrame transformations, and provides UDF SQL functions. 10 | 5. Supports below fundamental transformations for ETL pipeline - 11 | * Filters on source & target dataframes 12 | * Grouping and Aggregations on source & target dataframes 13 | * Heavily nested queries / dataframes 14 | 6. Has complex and heavily nested XML, JSON, Parquet & ORC parser to nth 15 | level of nesting 16 | 7. Has Unit test cases designed on function/method level & measures 17 | source code coverage 18 | 8. Has information about delpoying to higher environments 19 | 9. Has API documentation for customization & enhancement 20 | 21 | `Enhancements:` In progress - 22 | 1. Integrate Audit and logging - Define Error codes, log process failures, Audit progress & runtime information 23 | 24 | ## Setup for Python and Pyspark on Windows & Linux machines locally 25 | 26 | First please install 64 bit JDK 1.8 or 8 for your operating system from Oracle https://www.oracle.com/java/technologies/javase-jdk8-downloads.html 27 | set below below environment variable pointing to home directory of java (Please remember the path mentioned below may be different for you) 28 | ``` 29 | JAVA_HOME=C:\Program Files\Java\jdk1.8.0_231 30 | ``` 31 | 32 | Next, Install PyCharm community edition from https://www.jetbrains.com/pycharm/download/ 33 | 34 | 35 | Make sure you have Python >= 3.0 and Do not use Python 3.8; Recommended is Python 3.7 and PySpark 2.3.4 36 | If you do not have pip tool, get it from https://pypi.org/project/pip/ and execute below command 37 | **If you already have Python & pip installed skip the below step** 38 | ``` 39 | python get-pip.py 40 | ``` 41 | 42 | Once you have Python and pip follow steps below - 43 | 1. 
Install below libraries for virtual environment (Linux) 44 | ``` 45 | pip install virtualenvwrapper 46 | ``` 47 | Windows - 48 | ``` 49 | pip install virtualenvwrapper-win 50 | ``` 51 | Set up the working directory for virtual environments (Linux) 52 | ``` 53 | export WORKON_HOME=~/Envs 54 | mkdir -p $WORKON_HOME 55 | source /usr/local/bin/virtualenvwrapper.sh 56 | ``` 57 | Windows 58 | ``` 59 | Create directory C:\Users\\Envs 60 | WORKON_HOME=C:\Users\\Envs 61 | ``` 62 | 63 | 2. Setup below **ENVIRONMENT VARIABLES** 64 | 65 | **Unix / Linux / Mac** 66 | 67 | `Please note: Your computer path may vary, use your computer path in below given in example ` 68 | - **PYTHONPATH** - Full path to python executable 69 | ``` 70 | export PYTHONPATH=/usr/python37 71 | ``` 72 | - **PATH** - Update PATH variable add PYTHONPATH 73 | ``` 74 | export PATH=$PATH%:$PYTHONPATH 75 | ``` 76 | - **VIRTUALENV_PYTHON** - To create virtual environments. Path is same as PYTHONPATH 77 | ``` 78 | export VIRTUALENV_PYTHON=$PYTHONPATH 79 | ``` 80 | - **VIRTUALENVWRAPPER_VIRTUALENV** - Wrapper for Virtual Environment tools. Path is Scripts folder under PYTHONPATH 81 | ``` 82 | VIRTUALENVWRAPPER_VIRTUALENV=$PYTHONPATH/Scripts 83 | ``` 84 | **Windows** 85 | 86 | `Please note: Your computer path may vary, use your computer path in below given in example ` 87 | - **PYTHONPATH** - Full path to python.exe 88 | ``` 89 | PYTHONPATH=C:\Program Files\Python37 90 | ``` 91 | - **PATH** - Update PATH variable add PYTHONPATH 92 | ``` 93 | PATH=%PATH%;%PYTHONPATH% 94 | ``` 95 | - **VIRTUALENV_PYTHON** - To create virtual environments. Path is same as PYTHONPATH 96 | ``` 97 | VIRTUALENV_PYTHON=%PYTHONPATH% 98 | ``` 99 | - **VIRTUALENVWRAPPER_VIRTUALENV** - Wrapper for Virtual Environment tools. Path is Scripts folder under PYTHONPATH 100 | ``` 101 | VIRTUALENVWRAPPER_VIRTUALENV=%PYTHONPATH%\Scripts 102 | 103 | ``` 104 | 4. Install Hadoop Binaries for Windows: winutils.exe; Note this is only for Windows. Linux users Download Spark libraries for Linux - 105 | - Download binaries from URL https://github.com/steveloughran/winutils 106 | - Unzip and place in some directory: For Example I'm using C:\winutils-master 107 | - Within extracted folder we have hadoop-2.7.1 --> Example: C:\winutils-master\hadoop-2.7.1 108 | - Declare an Environment variable HADOOP_HOME = C:\winutils-master\hadoop-2.7.1 and update PATH Variable 109 | ``` 110 | HADOOP_HOME=C:\winutils-master\hadoop-2.7.1 111 | PATH=%PATH%;%HADOOP_HOME%\bin 112 | ``` 113 | 5. Check if all the Environment Variables are working - 114 | - Open Command prompt 115 | - Type `python` it must open Python 3.7.x 116 | - Type `winutils`, `hadoop`, `hdfs` it must give help instructions of HDFS 117 | 118 | 3. 
Create a spark project - 119 | - Create a directory for example `datalake-etl-pipeline` anywhere in your computer; Here I'm using `/home/` and create a new virtual environment using commands below - 120 | ``` 121 | mkvirtualenv -a -p py37 122 | workon py37 123 | cdproject 124 | ``` 125 | `cdproject` command will switch to `datalake-etl-pipeline` folder 126 | 127 | Import required libraries from requirements.txt from `datalake-etl-pipeline` root folder, use command below – 128 | ``` 129 | pip install -r requirements.txt 130 | ``` 131 | To Freeze the existing installed packages use command below - 132 | ``` 133 | pip freeze > requirements.txt 134 | ``` 135 | # ToDo - Yet to add instructions for deploying into higher environments 136 | ## Delpoy to higher environment -------------------------------------------------------------------------------- /docs/images/DataQualityUML.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/DataQualityUML.png -------------------------------------------------------------------------------- /docs/images/XMLParse.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/XMLParse.png -------------------------------------------------------------------------------- /docs/images/dq-task1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/dq-task1.png -------------------------------------------------------------------------------- /docs/images/dq-task2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/dq-task2.png -------------------------------------------------------------------------------- /docs/images/task1_ouput_er.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/task1_ouput_er.png -------------------------------------------------------------------------------- /docs/images/task2_ouput_er.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/docs/images/task2_ouput_er.png -------------------------------------------------------------------------------- /docs/setup.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 16 | 17 | 18 | 19 |
Vitthal Data Transformation

Data transformation simplified for any Data platform.

Features: The package has complete ETL process -

  1. Uses metadata, transformation & data model information to design ETL pipeline
  2. Builds target transformation SparkSQL and Spark Dataframes
  3. Builds source & target Hive DDLs
  4. Validates DataFrames, extends core classes, defines DataFrame transformations, and provides UDF SQL functions.
  5. Supports below fundamental transformations for ETL pipeline -
     • Filters on source & target dataframes
     • Grouping and Aggregations on source & target dataframes
     • Heavily nested queries / dataframes
  6. Has complex and heavily nested XML, JSON, Parquet & ORC parser to nth level of nesting
  7. Has Unit test cases designed on function/method level & measures source code coverage
  8. Has information about deploying to higher environments
  9. Has API documentation for customization & enhancement

Enhancements: In progress -

  1. Integrate Audit and logging - Define Error codes, log process failures, Audit progress & runtime information

Setup

Make sure you have Python >= 3.0. If you do not have the pip tool, get it from https://pypi.org/project/pip/ and execute the command below. If you already have Python & pip installed, skip this step.

python get-pip.py

Once you have Python and pip follow steps below -

  1. Install below libraries for virtual environment (Linux)

     pip install virtualenvwrapper

     Windows -

     pip install virtualenvwrapper-win

     Set up the working directory for virtual environments (Linux)

     export WORKON_HOME=~/Envs
     mkdir -p $WORKON_HOME
     source /usr/local/bin/virtualenvwrapper.sh

     Windows

     set WORKON_HOME=C:\Users\<username>\.Envs
     mkdir -p %WORKON_HOME%

  2. Setup below ENVIRONMENT VARIABLES

     Unix / Linux / Mac

     Please note: Your computer path may vary, use your computer path in place of the examples below

     • PYTHONPATH - Full path to python executable
       export PYTHONPATH=/usr/python37
     • PATH - Update PATH variable add PYTHONPATH
       export PATH=$PATH:$PYTHONPATH
     • VIRTUALENV_PYTHON - To create virtual environments. Path is same as PYTHONPATH
       export VIRTUALENV_PYTHON=$PYTHONPATH
     • VIRTUALENVWRAPPER_VIRTUALENV - Wrapper for Virtual Environment tools. Path is Scripts folder under PYTHONPATH
       VIRTUALENVWRAPPER_VIRTUALENV=$PYTHONPATH/Scripts

     Windows

     Please note: Your computer path may vary, use your computer path in place of the examples below

     • PYTHONPATH - Full path to python.exe
       PYTHONPATH=C:\Program Files\Python37
     • PATH - Update PATH variable add PYTHONPATH
       PATH=%PATH%;%PYTHONPATH%
     • VIRTUALENV_PYTHON - To create virtual environments. Path is same as PYTHONPATH
       VIRTUALENV_PYTHON=%PYTHONPATH%
     • VIRTUALENVWRAPPER_VIRTUALENV - Wrapper for Virtual Environment tools. Path is Scripts folder under PYTHONPATH
       VIRTUALENVWRAPPER_VIRTUALENV=%PYTHONPATH%\Scripts

  3. Copy the Vitthal-datalake folder to your home folder /home/<username> and create a new virtual environment using commands below -

     mkvirtualenv -a <path-to-Vitthal-datalake> -p <full-path-to-python.exe> py37
     workon py37
     cdproject

     cdproject command will switch to Vitthal-datalake folder

     Import required libraries from requirements.txt from Vitthal-datalake root folder, use command below –

     pip install -r requirements.txt

Deploy to higher environment
151 | 152 | -------------------------------------------------------------------------------- /logs/bash/logs: -------------------------------------------------------------------------------- 1 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - 55729 - Executing create_python_venv.sh on m-c02f6224md6n with arguments 2 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - WARN 55729 - Checking if python3 is installed on machine 3 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - /Library/Frameworks/Python.framework/Versions/3.9/bin/python3 4 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - 55729 - python3 already installed on machine 5 | 2023-04-18T18:06:35+05:30: EXECUTION-LOG - WARN 55729 - Checking if pip tool is installed on machine or attempt to download from internet & install 6 | 2023-04-18T18:06:36+05:30: EXECUTION-LOG - pip 21.1.3 from /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages/pip (python 3.9) 7 | 2023-04-18T18:06:36+05:30: EXECUTION-LOG - 55729 - pip tool already available on machine 8 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Defaulting to user installation because normal site-packages is not writeable 9 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: virtualenvwrapper in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (4.8.4) 10 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: virtualenv in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenvwrapper) (20.4.7) 11 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: virtualenv-clone in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenvwrapper) (0.5.4) 12 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: stevedore in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenvwrapper) (3.3.0) 13 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: pbr!=2.1.0,>=2.0.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from stevedore->virtualenvwrapper) (5.6.0) 14 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: six<2,>=1.9.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenv->virtualenvwrapper) (1.16.0) 15 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: appdirs<2,>=1.4.3 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenv->virtualenvwrapper) (1.4.4) 16 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: distlib<1,>=0.3.1 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenv->virtualenvwrapper) (0.3.2) 17 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - Requirement already satisfied: filelock<4,>=3.0.0 in /Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages (from virtualenv->virtualenvwrapper) (3.0.12) 18 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - DEBUG 55729 - Using below environment variables & path for virtual environment test 19 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - DEBUG 55729 - VIRTUALENVWRAPPER_PYTHON=/Library/Frameworks/Python.framework/Versions/3.9/bin/python3 20 | 2023-04-18T18:06:37+05:30: EXECUTION-LOG - DEBUG 55729 - WORKON_HOME=~/python_venvs/ 21 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - created virtual environment CPython3.9.5.final.0-64 in 1454ms 22 | 
2023-04-18T18:06:40+05:30: EXECUTION-LOG - creator CPython3Posix(dest=/Users/v0m02sj/python_venvs/test, clear=False, no_vcs_ignore=False, global=False) 23 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - seeder FromAppData(download=False, pip=bundle, setuptools=bundle, wheel=bundle, via=copy, app_data_dir=/Users/v0m02sj/Library/Application Support/virtualenv) 24 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - added seed packages: pip==21.1.2, setuptools==57.0.0, wheel==0.36.2 25 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - activators BashActivator,CShellActivator,FishActivator,PowerShellActivator,PythonActivator,XonshActivator 26 | 2023-04-18T18:06:40+05:30: EXECUTION-LOG - Setting project for test to /Users/v0m02sj/PycharmProjects/vim89-data-engineering-test 27 | 2023-04-18T18:06:46+05:30: EXECUTION-LOG - Removing test... 28 | 2023-04-18T18:06:47+05:30: EXECUTION-LOG - 55729 - Process finished successfully, logs can be found at /Users/v0m02sj/PycharmProjects/vim89-data-engineering-test/scripts/bin/../logs/log-python-venv-setup-test-2023-04-18-18.06.35.log 29 | -------------------------------------------------------------------------------- /logs/python/log-sample: -------------------------------------------------------------------------------- 1 | 2023-04-19 01:32:44,837 root INFO 63460 init_logging Logging initiated; appending logs to /Users/v0m02sj/PycharmProjects/vim89-data-engineering-test/logs/python/log-Hello-World_2023-04-19T01-32-44-810431.log 2 | 2023-04-19 01:32:44,837 root WARNING 63460 main HelloFresh Recipes Data Engineering 3 | 2023-04-19 01:32:44,838 root WARNING 63460 get_or_create_spark_session Creating spark session first time with configs [{'key': 'spark.app.name', 'value': ''}] 4 | 2023-04-19 01:32:44,838 root INFO 63460 read_data_as_spark_dataframe Attempting to read json in spark using configs {'encoding': 'UTF-8'} from location /Users/v0m02sj/PycharmProjects/vim89-data-engineering-test/resources/input 5 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | psycopg2-binary 2 | psycopg2 3 | botocore 4 | boto3 5 | boto 6 | awscli 7 | dos2unix 8 | lxml 9 | mock 10 | moto 11 | urllib3 12 | tqdm 13 | xmlschema 14 | xlrd 15 | awsglue-local 16 | flask 17 | flask_cors 18 | coverage 19 | isodate 20 | py4j 21 | pyspark 22 | pyspark-stubs 23 | pytz 24 | tabulate -------------------------------------------------------------------------------- /resources/data-quality-reports/recipe-tasks/task1-dq-report.html: -------------------------------------------------------------------------------- 1 | 2 |

Team,

Data Quality check finished for DQ ID = 101, but with failures. Check details in 3 | the table of metrics below.

4 |

Failed DQ details

5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 |
Yarn Application Id | DQ ID | Rule ID | Rule Name | Rule type | Description | Columns/Query | Pass Count | Fail Count | Total Count
local-1682280549403 | 101 | 1011 | Primary / Natural Keys | unique | Primary / Natural Keys should not have duplicates | ['name'] | 1039 | 3 | 1042
local-1682280549403 | 101 | 1012 | NOT NULL fields | not null | Field should have valid value | ['name', 'cookTime', 'prepTime'] | 715 | 327 | 1042
local-1682280549403 | 101 | 1014 | Check for invalid cook & prep time | query | Check empty or null values | None | 716 | 326 | 1042
55 |

Succeeded DQ details

56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 |
Yarn Application Id | DQ ID | Rule ID | Rule Name | Rule type | Description | Columns/Query | Pass Count | Fail Count | Total Count
local-1682280549403 | 101 | 1013 | Input files check | query | Check If all input files are read for processing | None | 1042 | 0 | 1042
82 |

Executed on 2023-04-24 01:39:19,
Thanks 83 | 84 |
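
Illustrative note: the pass/fail/total counts in the report above come from the rule types configured under conf/data-quality/rules ('unique', 'not null', 'query'). The sketch below shows one way a 'unique' and a 'not null' rule could yield counts like 1039/3/1042; it is not the repository's data_quality module, and the function names and blank-string handling are assumptions made for the example.

from pyspark.sql import DataFrame
from pyspark.sql import functions as F

def unique_rule_counts(df: DataFrame, key_columns: list) -> dict:
    # Pass = number of distinct key combinations; fail = surplus duplicate rows.
    total = df.count()
    distinct_keys = df.select(*key_columns).distinct().count()
    return {"pass": distinct_keys, "fail": total - distinct_keys, "total": total}

def not_null_rule_counts(df: DataFrame, columns: list) -> dict:
    # Pass = rows where every listed column is non-null and non-blank (blank handling is an assumption).
    condition = F.lit(True)
    for c in columns:
        condition = condition & F.col(c).isNotNull() & (F.trim(F.col(c)) != "")
    total = df.count()
    passed = df.filter(condition).count()
    return {"pass": passed, "fail": total - passed, "total": total}

# Hypothetical usage against the recipes data:
# unique_rule_counts(recipes_df, ["name"])
# not_null_rule_counts(recipes_df, ["name", "cookTime", "prepTime"])
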
-------------------------------------------------------------------------------- /resources/data-quality-reports/recipe-tasks/task2-dq-report.html: -------------------------------------------------------------------------------- 1 | 2 |

Team,

Data Quality check finished successfully for DQ ID = 101. Check details in the table of 3 | metrics below.

4 |

Succeeded DQ details

5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 |
Yarn Application Id | DQ ID | Rule ID | Rule Name | Rule type | Description | Columns/Query | Pass Count | Fail Count | Total Count
local-1682280549403 | 101 | 1015 | Primary / Natural Keys | unique | Primary / Natural Keys should not have duplicates | ['difficulty'] | 3 | 0 | 3
local-1682280549403 | 101 | 1016 | NOT NULL fields | not null | Field should have valid value | ['difficulty', 'avg_total_cooking_time'] | 3 | 0 | 3
43 |

Executed on 2023-04-24 01:39:24,
Thanks 44 | 45 |
-------------------------------------------------------------------------------- /resources/data/clinical_trial/data/chunk1.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/clinical_trial/data/chunk1.zip -------------------------------------------------------------------------------- /resources/data/clinical_trial/job_parameters/clinical_trial.json: -------------------------------------------------------------------------------- 1 | { 2 | "clinical_trial_etl": { 3 | "bucket": "dev", 4 | "landing_directory": "data/raw/clinical_trial_landing", 5 | "staging_directory": "data/raw/clinical_trial_staging", 6 | "download_url_prefix_test": "https://github.com/vim89/datalake-etl-pipeline/raw/master/src/resources/clinical_trial/data/chunk", 7 | "download_url_prefix": "https://clinicaltrials.gov/ct2/download_studies?down_chunk=", 8 | "max_chunk_range": 2, 9 | "download_target_filename": "clinical_studies.zip", 10 | "xml_closing_tag": "clinical_study", 11 | "xml_root_tag": "clinical_study", 12 | "xml_row_tag": "clinical_study", 13 | "xml_attribute_tag": "xml_attribute_value", 14 | "xml_attribute_prefix": "xmlattribute_", 15 | "xml_value_tag": "xml_value_tag", 16 | "audit_columns_definition": [ 17 | "reverse(split(input_file_name(), '/'))[0] AS xml_file_name", 18 | "CAST('{ts}' AS TIMESTAMP) AS spark_timestamp" 19 | ], 20 | "audit_columns": [ 21 | "xml_file_name", 22 | "spark_timestamp" 23 | ], 24 | "timestamp_column": [ 25 | "spark_timestamp" 26 | ], 27 | "primary_keys": [ 28 | "id_info.nct_id", 29 | "xml_file_name" 30 | ], 31 | "primary_keys_cascade_to_leaf_level_with_alias": [ 32 | "id_info.nct_id AS pk_nct_id", 33 | "spark_timestamp AS spark_ts" 34 | ], 35 | "order_by_keys": [ 36 | "spark_timestamp" 37 | ], 38 | "hashcode_column": [ 39 | "hashcode" 40 | ], 41 | "target_primary_keys": [ 42 | "nct_id" 43 | ], 44 | "hashcode_encryption_type": "md5", 45 | "cdc_staging_data_write_mode": "append", 46 | "audit_directory": "audit/", 47 | "job_name": "clinical_trial_etl" 48 | } 49 | } -------------------------------------------------------------------------------- /resources/data/clinical_trial/sql/transformations/sponsors.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | nct_id, 3 | agency_class, 4 | lead_or_collaborator, 5 | name, 6 | CAST(last_update_timestamp AS TIMESTAMP) AS last_update_timestamp 7 | FROM 8 | ( 9 | SELECT xmltable_sponsors_lead_sponsor.pk_nct_id AS nct_id, agency_class agency_class, 'lead' AS lead_or_collaborator, 10 | agency AS name, xmltable_sponsors_lead_sponsor.spark_ts AS last_update_timestamp 11 | FROM xmltable_sponsors_lead_sponsor 12 | LEFT JOIN xmltable_sponsors ON 13 | xmltable_sponsors.surrogate_id_xmltable_sponsors = xmltable_sponsors_lead_sponsor.surrogate_id_xmltable_sponsors 14 | AND xmltable_sponsors.pk_nct_id = xmltable_sponsors_lead_sponsor.pk_nct_id 15 | 16 | UNION ALL 17 | 18 | SELECT xmltable_sponsors_collaborator.pk_nct_id AS nct_id, agency_class AS agency_class, 19 | 'collaborator' AS lead_or_collaborator, agency AS name, 20 | xmltable_sponsors_collaborator.spark_ts AS last_update_timestamp 21 | FROM xmltable_sponsors_collaborator 22 | LEFT JOIN xmltable_sponsors ON xmltable_sponsors.surrogate_id_xmltable_sponsors = xmltable_sponsors_collaborator.surrogate_id_xmltable_sponsors 23 | AND xmltable_sponsors.pk_nct_id = 
xmltable_sponsors_collaborator.pk_nct_id 24 | ) sponsors 25 | -------------------------------------------------------------------------------- /resources/data/config/application_properties.json: -------------------------------------------------------------------------------- 1 | { 2 | "version": 1, 3 | "default_settings": { 4 | "max_parallel_spark_submit_process": 6, 5 | "history_load_interval_in_days": 30 6 | }, 7 | "command_line_args": [ 8 | { 9 | "name": "workflow", 10 | "type": "string", 11 | "default": "None" 12 | }, 13 | { 14 | "name": "startDate", 15 | "type": "string", 16 | "default": "None" 17 | }, 18 | { 19 | "name": "endDate", 20 | "type": "string", 21 | "default": "None" 22 | }, 23 | { 24 | "name": "refreshType", 25 | "type": "string", 26 | "default": "None" 27 | }, 28 | { 29 | "name": "dq_enabled", 30 | "type": "string", 31 | "default": "N" 32 | }, 33 | { 34 | "name": "configFile", 35 | "type": "string", 36 | "default": "/u/users/svcdvchnlperf/adhoc/config-prod.yml" 37 | } 38 | ], 39 | "spark_submit_options_order": { 40 | "spark-submit": { 41 | "priority": 0, 42 | "required": false, 43 | "value": "" 44 | }, 45 | "--master": { 46 | "priority": 1, 47 | "required": true, 48 | "value": "yarn" 49 | }, 50 | "--deploy-mode": { 51 | "priority": 2, 52 | "required": true, 53 | "value": "cluster" 54 | }, 55 | "--executor-cores": { 56 | "priority": 3, 57 | "required": true, 58 | "value": 5 59 | }, 60 | "--executor-memory": { 61 | "priority": 4, 62 | "required": true, 63 | "value": "4g" 64 | }, 65 | "--num-executors": { 66 | "priority": 5, 67 | "required": true, 68 | "value": 20 69 | }, 70 | "--driver-memory": { 71 | "priority": 6, 72 | "required": true, 73 | "value": "6g" 74 | }, 75 | "--name": { 76 | "priority": 7, 77 | "required": false, 78 | "value": "Channel Performance Spark Job" 79 | }, 80 | "--driver-java-options": { 81 | "priority": 8, 82 | "required": true, 83 | "value": "" 84 | }, 85 | "--conf": { 86 | "priority": 9, 87 | "required": true, 88 | "value": "\"spark.executor.memory=4g\"" 89 | }, 90 | "--jars": { 91 | "priority": 10, 92 | "required": true, 93 | "value": "\"/u/users/svcdvchnlperf/adhoc/ScalaSparkArchetypeCore-1.9.3-bundled.jar\"" 94 | }, 95 | "--files": { 96 | "priority": 11, 97 | "required": true, 98 | "value": "\"/u/users/svcdvchnlperf/adhoc/connections/connection.yaml,/u/users/svcdvchnlperf/adhoc/connections/job.yaml\"" 99 | }, 100 | "--class": { 101 | "priority": 12, 102 | "required": true, 103 | "value": "com.walmartlabs.channel.perf.WorkflowController " 104 | }, 105 | "--class_arguments": { 106 | "priority": 13, 107 | "required": false, 108 | "value": { 109 | "workflow": "", 110 | "dq_enabled": "", 111 | "startDate": "", 112 | "endDate": "", 113 | "refreshType": "", 114 | "configFile": "" 115 | } 116 | } 117 | }, 118 | "spark_submit_options_filter": [ 119 | "primary_keys", 120 | "ADHOC_SCHEMA_GCS_BUCKET", 121 | "STG_SCHEMA_GCS_BUCKET", 122 | "APP_SCHEMA_GCS_BUCKET", 123 | "ADHOC_SCHEMA", 124 | "STG_SCHEMA", 125 | "APP_SCHEMA", 126 | "env", 127 | "enableservices", 128 | "runmode", 129 | "srcrcvts", 130 | "userId" 131 | ] 132 | } -------------------------------------------------------------------------------- /resources/data/config/application_properties.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | default_settings: 3 | max_parallel_spark_submit_process: 5 4 | history_load_interval_in_days: 30 -------------------------------------------------------------------------------- 
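
Illustrative note on the application_properties.json above: each entry under spark_submit_options_order carries a priority, a required flag and a value. The sketch below shows one way such a mapping could be sorted by priority and flattened into a spark-submit command; the real assembly lives in src/com/vitthalmirji/utils/spark_submit_utils.py and may differ, and the '--key value' rendering of class arguments is an assumption for the example.

import json

def build_spark_submit_command(properties_path, class_args):
    # Load the option ordering from the JSON config shown above.
    with open(properties_path) as f:
        options = json.load(f)["spark_submit_options_order"]

    parts = []
    for name, spec in sorted(options.items(), key=lambda kv: kv[1]["priority"]):
        if name == "spark-submit":
            parts.append("spark-submit")
        elif name == "--class_arguments":
            # Rendered as '--key value' tokens purely for illustration.
            parts.extend(f"--{key} {value}" for key, value in class_args.items())
        elif spec["required"] or spec["value"]:
            parts.append(f"{name} {spec['value']}".strip())
    return " ".join(parts)

# Hypothetical usage:
# build_spark_submit_command("resources/data/config/application_properties.json",
#                            {"workflow": "daily_load", "dq_enabled": "Y"})
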
/resources/data/config/logging.yaml: -------------------------------------------------------------------------------- 1 | version: 1 2 | objects: 3 | queue: 4 | class: queue.Queue 5 | maxsize: 1000 6 | formatters: 7 | simple: 8 | format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s' 9 | detailed: 10 | format: '%(asctime)s %(name)-15s %(levelname)-8s %(process)-10d %(funcName)-30s %(message)s' 11 | handlers: 12 | console: 13 | class: logging.StreamHandler 14 | level: DEBUG 15 | formatter: detailed 16 | stream: ext://sys.stdout 17 | console_colored: 18 | class: utils.logging_util.ColoredLogger 19 | name: 'Colored' 20 | file: 21 | class: logging.FileHandler 22 | level: DEBUG 23 | encoding: 'utf-8' 24 | formatter: detailed 25 | filename: ../../logs/log-data-pipeline_{timestamp_placeholder}.log 26 | mode: a 27 | queue: 28 | class: utils.logging_util.QueueListenerHandler 29 | level: DEBUG 30 | handlers: 31 | - cfg://handlers.console 32 | - cfg://handlers.file 33 | queue: cfg://objects.queue 34 | loggers: 35 | simpleExample: 36 | level: INFO 37 | handlers: [console, file, queue] 38 | propagate: no 39 | root: 40 | level: DEBUG 41 | handlers: [console, file] -------------------------------------------------------------------------------- /resources/data/product.csv: -------------------------------------------------------------------------------- 1 | id,name ,price 2 | 1 ,Wrist Watch,10 3 | 2 ,Shoes ,8 4 | 3 ,Tshirt ,5 5 | 4 ,Jeans ,7 6 | 5 ,Sunglasses ,7 7 | -------------------------------------------------------------------------------- /resources/data/purchase.csv: -------------------------------------------------------------------------------- 1 | id ,productid,purchasedate,storeid 2 | 100,1 ,10/11/2019 ,1000 3 | 101,3 ,10/12/2019 ,1002 4 | 102,2 , ,1004 5 | 103,1 ,10/14/2019 ,1004 6 | 104,4 ,10/15/2019 ,1003 7 | 105,4 ,10/16/2019 ,1002 8 | -------------------------------------------------------------------------------- /resources/data/recipes/output/task1/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /resources/data/recipes/output/task1/.part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task1/.part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet.crc -------------------------------------------------------------------------------- /resources/data/recipes/output/task1/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task1/_SUCCESS -------------------------------------------------------------------------------- /resources/data/recipes/output/task1/part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task1/part-00000-cfbce4d7-507b-499a-9112-aaa2f033da9b-c000.snappy.parquet -------------------------------------------------------------------------------- 
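
Illustrative note on resources/data/config/logging.yaml above: the file handler's filename carries a {timestamp_placeholder} token, and the queue handler points at custom classes in utils.logging_util. Below is a minimal loading sketch, assuming PyYAML is installed and those custom classes are importable; the project's own bootstrap (utils/logging_util.py, utils/comprehensive_logging.py) may do this differently.

import datetime
import logging
import logging.config

import yaml

def init_logging_from_yaml(config_path):
    with open(config_path) as f:
        raw = f.read()
    # Substitute the literal {timestamp_placeholder} token before handing the dict to dictConfig.
    stamped = raw.replace("{timestamp_placeholder}",
                          datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S"))
    logging.config.dictConfig(yaml.safe_load(stamped))

# Hypothetical usage:
# init_logging_from_yaml("resources/data/config/logging.yaml")
# logging.getLogger("simpleExample").info("logging configured")
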
/resources/data/recipes/output/task2/._SUCCESS.crc: -------------------------------------------------------------------------------- 1 | crc -------------------------------------------------------------------------------- /resources/data/recipes/output/task2/.part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv.crc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task2/.part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv.crc -------------------------------------------------------------------------------- /resources/data/recipes/output/task2/_SUCCESS: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/resources/data/recipes/output/task2/_SUCCESS -------------------------------------------------------------------------------- /resources/data/recipes/output/task2/part-00000-e56200f2-0e4d-46f1-ab6b-78a44a00c066-c000.csv: -------------------------------------------------------------------------------- 1 | difficulty,avg_total_cooking_time 2 | easy,PT7M5.086705S 3 | hard,PT2H43M37.105263S 4 | medium,PT41M53.288136S 5 | -------------------------------------------------------------------------------- /resources/data/store.csv: -------------------------------------------------------------------------------- 1 | id ,name 2 | 1000,Borivili 3 | 1001,Kandivili 4 | 1002,Andheri 5 | 1003,Bandra 6 | 1004,Dadar 7 | 1005,Byculla 8 | -------------------------------------------------------------------------------- /sbin/common_functions.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #********************************************************# 3 | # Common bash reusable functions 4 | # library_functions.sh 5 | # April 2023 6 | #********************************************************# 7 | 8 | #********************************************************* 9 | # Comprehensive Logging 10 | #********************************************************* 11 | 12 | #-------------------------------------------------------------------- 13 | # Prints a log statement 14 | # Parameter: (message) (level: DEBUG,INFO,ERROR,AUDIT) 15 | # Returns: N/A 16 | #-------------------------------------------------------------------- 17 | log(){ 18 | local msg=$1 19 | local lvl=$2 20 | if [ -z "${lvl}" ]; then 21 | lvl="INFO" 22 | fi 23 | 24 | ## 0=default; 31=red; 33=yellow, 93=light yellow; 34=blue 25 | # shellcheck disable=SC2155 26 | # shellcheck disable=SC2034 27 | local lts=$(date +%FT%T.%3N) 28 | case "${lvl}" in 29 | ("ERROR") 30 | >&2 echo -e "\e[31m${lvl}" $$ "-" "${msg}\e[0m" 31 | ;; 32 | ("WARN") 33 | echo -e "\e[93m${lvl}" $$ "-" "${msg}\e[0m" 34 | ;; 35 | ("AUDIT") 36 | echo -e "\e[34m${lvl}" $$ "-" "${msg}\e[0m" 37 | isCheckRequired=true 38 | ;; 39 | ("DEBUG") 40 | echo -e "\e[33m${lvl}" $$ "-" "${msg}\e[0m" 41 | # shellcheck disable=SC2034 42 | isCheckRequired=true 43 | ;; 44 | (*) echo -e "\e[0m"$$ "-" "${msg}" 45 | return 1 46 | ;; 47 | esac 48 | } 49 | 50 | #-------------------------------------------------------------------- 51 | # Prints an error 52 | # Parameter: Error message 53 | # Returns: N/A 54 | #-------------------------------------------------------------------- 55 | logError(){ 56 | log "$1" "ERROR" 57 | } 58 | 59 | 
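# Illustrative usage of the logging helpers above (example calls shown as comments only):
#   log "Starting ingestion"              # level defaults to INFO, printed without colour
#   log "Low disk space" "WARN"           # light yellow
#   logError "spark-submit failed"        # red, written to stderr
#   logDebug "Parsed 3 arguments"         # yellow, also flags isCheckRequired=true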
#-------------------------------------------------------------------- 60 | # Prints a warn message 61 | # Parameter: Error message 62 | # Returns: N/A 63 | #-------------------------------------------------------------------- 64 | logWarn(){ 65 | log "$1" "WARN" 66 | } 67 | 68 | #-------------------------------------------------------------------- 69 | # Prints an audit message 70 | # Parameter: Error message 71 | # Returns: N/A 72 | #-------------------------------------------------------------------- 73 | logAudit(){ 74 | log "$1" "AUDIT" 75 | } 76 | 77 | #-------------------------------------------------------------------- 78 | # Prints a debug message 79 | # Parameter: Error message 80 | # Returns: N/A 81 | #-------------------------------------------------------------------- 82 | logDebug(){ 83 | log "$1" "DEBUG" 84 | } 85 | 86 | #-------------------------------------------------------------------- 87 | # Performs cleanup task 88 | # Parameter: N/A 89 | # Returns: N/A 90 | #--------------------------------------------------------------------- 91 | cleanup(){ 92 | log "Process finished successfully, logs can be found at ${LOG_FILE}" 93 | } 94 | 95 | #-------------------------------------------------------------------- 96 | # Called using trap on SIGINT, SIGQUIT, SIGABRT, SIGALRM, SIGTERM 97 | # Parameter: Error message 98 | # Returns: N/A 99 | #--------------------------------------------------------------------- 100 | interrupt(){ 101 | logError "Process got interrupted with exit code $?! Check error logs in ${LOG_FILE}" 102 | exit 1 103 | } 104 | 105 | #-------------------------------------------------------------------- 106 | # Displays a loading indicator for background jobs 107 | # Parameter: Subprocess pid 108 | # Returns: N/A 109 | #--------------------------------------------------------------------- 110 | loadingIndicator(){ 111 | local pid=$1 112 | spin='-\|/' 113 | 114 | local i=0 115 | while kill -0 $pid 2>/dev/null 116 | do 117 | i=$(( (i+1) %4 )) 118 | # shellcheck disable=SC2059 119 | printf "\r${spin:$i:1}" 120 | sleep .1 121 | done 122 | } 123 | -------------------------------------------------------------------------------- /sbin/create_python_venv.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | #********************************************************# 3 | # Python Virtual Environment setup 4 | # create_python_venv.sh 5 | # April 2023 6 | #********************************************************# 7 | 8 | #-------------------------------------------------------------------- 9 | # Prints usage of script, non-zero exit in case of incorrect usage 10 | # Parameter: N/A 11 | # Returns: N/A 12 | #-------------------------------------------------------------------- 13 | scriptUsage() { 14 | logError "Usage: ${SCRIPT_NAME} [./create_python_venv.sh -n VIRTUAL_ENV_NAME]" 15 | logError "Do not use 'sh' shell to run the script; use 'bash' or ./create_python_venv.sh " 1>&2 16 | exit 1; 17 | } 18 | 19 | #----------------------------------------------------------------------- 20 | # Checks if python3 exists otherwise exit with non-zero 21 | # Parameter: N/A 22 | # Returns: 0 if python3 exist else exit with non-zero 23 | #----------------------------------------------------------------------- 24 | python3Exists() { 25 | logWarn "Checking if python3 is installed on machine" 26 | if ! 
which python3; then 27 | logError "python3 is not installed in the machine, please install python3 as base to create virtual environments on top of base python" 28 | exit 1; 29 | fi 30 | log "python3 already installed on machine" 31 | return 0; 32 | } 33 | 34 | #----------------------------------------------------------------------- 35 | # Checks if pip tool exists otherwise downloads from internet to install 36 | # Exit with non-zero in case of poor or no internet connection 37 | # Parameter: N/A 38 | # Returns: N/A 39 | #----------------------------------------------------------------------- 40 | pipExists() { 41 | logWarn "Checking if pip tool is installed on machine or attempt to download from internet & install" 42 | if ! pip --version; then 43 | if ! curl https://bootstrap.pypa.io/get-pip.py --output get-pip.py; then 44 | logError "Error downloading file from the internet; check your internet connection & proxy settings" 45 | exit 1; 46 | else 47 | log "Downloaded get-pip.py successfully" 48 | if ! python get-pip.py; then 49 | logError "Error installing pip, check logs" 50 | exit 1; 51 | fi 52 | log "pip installed successfully, upgrading" 53 | python3 -m pip install --upgrade pip 54 | return 0 55 | fi 56 | else 57 | log "pip tool already available on machine, upgrading" 58 | python3.9 -m pip install --upgrade pip 59 | return 0 60 | fi 61 | } 62 | 63 | #-------------------------------------------------------------------- 64 | # Installs virtualenvwrapper 65 | # Exit with non-zero in case of any error during installation, 66 | # Parameter: N/A 67 | # Returns: 0 if installation is successful, non-zero exit otherwise 68 | #-------------------------------------------------------------------- 69 | installVEnvWrapper() { 70 | mkdir -p "$HOME/python_venvs/" 71 | if ! pip install virtualenvwrapper; then 72 | logError "Error installing virtualenvwrapper using pip; check logs & check your internet connection & proxy" 73 | exit 1; 74 | fi 75 | return 0 76 | } 77 | 78 | #-------------------------------------------------------------------- 79 | # Creates python virtual environment by given name 80 | # Exit with non-zero in case of any error during creation, 81 | # Parameter: virtualEnvName: name of the virtual environment to create 82 | # Returns: N/A 83 | #-------------------------------------------------------------------- 84 | createVirtualEnv() { 85 | local virtualEnvName 86 | virtualEnvName=$(echo "$1" | xargs) #xargs is to trim 87 | local python3FullPath 88 | python3FullPath=$(which python3) 89 | export VIRTUALENVWRAPPER_PYTHON="${python3FullPath}" 90 | export WORKON_HOME="$HOME/python_venvs/" 91 | export PROJECT_HOME="${HOME_DIRECTORY}/../" 92 | logDebug "Using below environment variables & path for virtual environment ${virtualEnvName}" 93 | logDebug "VIRTUALENVWRAPPER_PYTHON=${VIRTUALENVWRAPPER_PYTHON}" 94 | logDebug "WORKON_HOME=${WORKON_HOME}" 95 | 96 | source virtualenvwrapper.sh 97 | 98 | rmvirtualenv "${virtualEnvName}" 99 | 100 | if ! 
mkvirtualenv -a "${HOME_DIRECTORY}/../" -p "${python3FullPath}" "${virtualEnvName}";then 101 | logError "Error creating virtual environment ${virtualEnvName}" 102 | exit 1; 103 | fi 104 | } 105 | 106 | #------------------------------------------------------------------------------ 107 | # installs required packages for virtual environment given in requirements.txt 108 | # Exit with non-zero in case of any error during installation, 109 | # Parameter: virtualEnvName: name of the virtual environment to create 110 | # Returns: N/A 111 | #------------------------------------------------------------------------------ 112 | installRequiredPackages() { 113 | local virtualEnvName 114 | virtualEnvName=$(echo "$1" | xargs) #xargs is to trim 115 | workon "${virtualEnvName}" 116 | cdproject 117 | pip install -r requirements.txt 118 | # pip freeze > requirements.txt 119 | python3 -m pip install --upgrade pip 120 | # source activate 121 | } 122 | 123 | #************************************************************************ 124 | # 125 | # MAIN SCRIPTS STARTS HERE 126 | # 127 | #************************************************************************ 128 | 129 | # Execute ./create_python_venv.sh -n hello-fresh-data-engg 130 | 131 | # Read initial variables 132 | HOST_NAME=`hostname` 133 | USER=`whoami` 134 | SCRIPT_NAME=$(basename "$0") 135 | HOME_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 136 | cd "${HOME_DIRECTORY}" || exit # exit in case cd fails; very rare 137 | source "${HOME_DIRECTORY}/common_functions.sh" 138 | 139 | export SETUPTOOLS_USE_DISTUTILS=stdlib 140 | 141 | # trap interrupts 0 SIGHUP SIGINT SIGQUIT SIGABRT SIGALRM SIGTERM 142 | trap interrupt 1 2 3 6 14 15 143 | trap cleanup 0 144 | 145 | while getopts ":n:" arg; do 146 | case "${arg}" in 147 | n) 148 | VENV_NAME=${OPTARG} 149 | ;; 150 | *) 151 | scriptUsage 152 | ;; 153 | esac 154 | done 155 | shift $((OPTIND-1)) 156 | 157 | if [[ -z ${VENV_NAME} ]]; then 158 | logError "Empty virtual environment name" 159 | scriptUsage 160 | fi 161 | 162 | mkdir -p "${HOME_DIRECTORY}/../logs/bash/" 163 | LOG_FILE="${HOME_DIRECTORY}/../logs/bash/log-python-venv-setup-${VENV_NAME}-$(date +%F-%H.%M.%S).log" 164 | # Global log redirect 165 | exec &> >(while read -r line; do printf '%s %s\n' "$(date -Iseconds): EXECUTION-LOG - $line"; done | tee -a "${LOG_FILE}" ) 166 | 167 | log "Executing $SCRIPT_NAME on $HOST_NAME with arguments" 168 | 169 | if python3Exists && pipExists; then 170 | installVEnvWrapper 171 | createVirtualEnv "${VENV_NAME}" 172 | installRequiredPackages "${VENV_NAME}" 173 | fi 174 | 175 | exit 0; 176 | -------------------------------------------------------------------------------- /sbin/execute-tasks-spark-submit.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Read initial variables 4 | HOME_DIRECTORY="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" 5 | cd "${HOME_DIRECTORY}" || exit # exit in case cd fails; very rare 6 | export PYTHONPATH=${PYTHONPATH}:"${HOME_DIRECTORY}/../src/" 7 | export VIRTUALENVWRAPPER_PYTHON="$(which python3)" 8 | export WORKON_HOME="$HOME/python_venvs/" 9 | export PROJECT_HOME="${HOME_DIRECTORY}../" 10 | source virtualenvwrapper.sh 11 | source "$HOME/python_venvs/hello-fresh-data-engg/bin/activate" 12 | 13 | while getopts ":e:c:m:" arg; do 14 | case "${arg}" in 15 | e) 16 | NUM_EXECS=${OPTARG} 17 | ;; 18 | c) 19 | EXEC_CORES=${OPTARG} 20 | ;; 21 | m) 22 | EXEC_MEM=${OPTARG} 23 | ;; 24 | *) 25 | scriptUsage 26 | ;; 27 | 
esac 28 | done 29 | shift $((OPTIND-1)) 30 | 31 | if [[ -z ${NUM_EXECS} || -z ${EXEC_CORES} || -z ${EXEC_MEM} ]]; then 32 | NUM_EXECS="2" 33 | NUM_CORES="1" 34 | EXEC_MEM="1g" 35 | fi 36 | 37 | FILES="${HOME_DIRECTORY}/../conf/data-quality/rules/production_configs/recipe-task1-dq-rules.json,${HOME_DIRECTORY}/../conf/data-quality/rules/production_configs/recipe-task2-dq-rules.json,${HOME_DIRECTORY}/../conf/spark/log4j.properties" 38 | 39 | spark-submit \ 40 | --master local[*] \ 41 | --name "HelloFresh Data Engineering Recipe tasks" \ 42 | --driver-memory 1g \ 43 | --num-executors "${NUM_EXECS}" \ 44 | --executor-cores "${NUM_CORES}" \ 45 | --executor-memory "${EXEC_MEM}" \ 46 | --conf spark.dynamicAllocation.enabled=false \ 47 | --conf spark.yarn.maxAppAttempts=1 \ 48 | --conf spark.serializer="org.apache.spark.serializer.KryoSerializer" \ 49 | --conf spark.driver.extraJavaOptions="-Dlog4j.configuration=log4j.properties" \ 50 | --files "${FILES}" \ 51 | ../src/com/vitthalmirji/datapipelines/recipe_tasks.py --input-data-dir "${HOME_DIRECTORY}/../resources/data/input" --output-data-dir "${HOME_DIRECTORY}/../resources/data/output" 52 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | import subprocess 2 | import sys 3 | 4 | from setuptools.command.install import install 5 | from setuptools import setup, find_packages 6 | 7 | with open('requirements.txt') as f: 8 | requirements = f.read().splitlines() 9 | 10 | tests_require = ['pytest', 'pytest-cov', 'coverage'] 11 | 12 | with open("docs/ETL_README.md", "r") as f: 13 | long_description = f.read() 14 | 15 | 16 | class ShellInstall(install): 17 | def run(self): 18 | if not sys.platform.startswith("linux"): 19 | print('Your platform {} might not be supported'.format(sys.platform)) 20 | else: 21 | print('Running create_python_venv.sh -n hello-fresh-data-engg') 22 | subprocess.call(['./sbin/create_python_venv.sh', '-n', 'hello-fresh-data-engg']) 23 | install.run(self) 24 | 25 | 26 | setup( 27 | cmdclass={'install': ShellInstall}, 28 | name='datapipelines-essentials', 29 | version='2.0', 30 | author='Vitthal Mirji', 31 | author_email='vitthalmirji@gmail.com', 32 | url='https://vitthalmirji.com', 33 | description='Datalake complex transformations simplified in PySpark', 34 | long_description='Simplified ETL process in Hadoop using Apache Spark. 
' 35 | 'SparkSession extensions, DataFrame validation, Column extensions, SQL functions, and DataFrame ' 36 | 'transformations', 37 | long_description_content_type="text/markdown", 38 | install_requires=requirements, 39 | tests_require=tests_require, 40 | extras_require={ 41 | 'test': tests_require, 42 | 'all': requirements + tests_require, 43 | 'docs': ['sphinx'] + tests_require, 44 | 'lint': [] 45 | }, 46 | license="GNU :: GPLv3", 47 | include_package_data=True, 48 | packages=find_packages(where='src', include=['com*']), 49 | package_dir={"": "src"}, 50 | setup_requires=['setuptools'], 51 | classifiers=[ 52 | "Programming Language :: Python :: 3", 53 | "License :: GNU :: GPLv3", 54 | "Operating System :: Linux", 55 | ], 56 | dependency_links=[], 57 | python_requires='>=3.7,<=3.9.5', 58 | keywords=['apachespark', 'spark', 'pyspark', 'etl', 'hadoop', 'bigdata', 'apache-spark', 'python', 'python3', 59 | 'data', 'dataengineering', 'datapipelines'] 60 | ) 61 | -------------------------------------------------------------------------------- /src/com/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/datapipelines/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/datapipelines/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/datapipelines/clinical_trial/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/datapipelines/clinical_trial/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/datawarehousing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/datawarehousing/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/datawarehousing/change_data_capture.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | 3 | from pyspark.sql import SparkSession, DataFrame, Window 4 | from pyspark.sql.functions import col, row_number 5 | 6 | from com.vitthalmirji.utils.Utilities import is_null_or_empty 7 | 8 | 9 | def append_audit_attributes_to_xml(file, file_contents, xml_closing_tag): 10 | hash_val = hashlib.md5(file_contents.encode('utf-8')).hexdigest() 11 | return str(file_contents).replace(f'', 12 | f'{hash_val}' 14 | f'{str(file)}') 16 | 17 | 18 | def add_row_number_to_dataframe(dataframe: DataFrame, primary_keys, order_by_keys, 
eliminate_duplicate_records=False, 19 | drop_row_number_column=False): 20 | window = Window.partitionBy( 21 | *list(map(lambda c: col(c), primary_keys))).orderBy( 22 | *list(map(lambda c: col(c).desc(), order_by_keys))) 23 | row_num_col = row_number().over(window=window).alias('row_num') 24 | 25 | if eliminate_duplicate_records and drop_row_number_column: 26 | return dataframe.withColumn(colName='row_num', col=row_num_col).filter('row_num = 1').drop('row_num') 27 | elif eliminate_duplicate_records: 28 | return dataframe.withColumn(colName='row_num', col=row_num_col).filter('row_num = 1') 29 | else: 30 | return dataframe.withColumn(colName='row_num', col=row_num_col) 31 | 32 | 33 | def add_audit_columns(_df: DataFrame) -> DataFrame: 34 | import datetime 35 | ts = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") 36 | df: DataFrame = _df 37 | sel_cols = list(map(lambda x: str(f'`{x}`'), df.schema.names)) 38 | sel_cols.append(f"reverse(split(input_file_name(), '/'))[0] AS spark_file_name") 39 | sel_cols.append(f"CAST('{ts}' AS TIMESTAMP) AS spark_timestamp") 40 | print(sel_cols) 41 | df: DataFrame = df.selectExpr(sel_cols) 42 | return df 43 | 44 | 45 | def identify_new_records(spark: SparkSession, old_dataframe: DataFrame, new_dataframe: DataFrame, 46 | primary_keys=[], order_by_keys=['current_timestamp']) -> DataFrame: 47 | old_df = "old_df" 48 | new_df = "new_df" 49 | 50 | if is_null_or_empty(primary_keys): 51 | print("WARNING - Empty primary keys given: Assuming all fields in the table for Deduplication") 52 | dedup_query = f"SELECT *FROM (SELECT t1.*, row_number() over (order by {','.join(order_by_keys)} desc) as row_num FROM {old_df} t1) WHERE row_num = 1" 53 | elif is_null_or_empty(old_dataframe) and is_null_or_empty( 54 | new_dataframe) and new_dataframe.count() <= 0 and old_dataframe.count() <= 0: 55 | print("Empty Dataframes") 56 | return None 57 | elif not is_null_or_empty(new_dataframe) and new_dataframe.count() > 0 and ( 58 | is_null_or_empty(old_dataframe) or old_dataframe.count() <= 0): 59 | print("Assuming initial load CDC not required") 60 | return new_dataframe 61 | else: 62 | print(f"Before CDC Staging count = {old_dataframe.count()}") 63 | dedup_query = f"SELECT *FROM (SELECT t1.*, row_number() over (partition by {','.join(primary_keys)} order by {','.join(order_by_keys)} desc) as row_num FROM {old_df} t1) WHERE row_num = 1" 64 | old_dataframe.createOrReplaceTempView(old_df) 65 | new_dataframe.createOrReplaceTempView(new_df) 66 | spark.sql(dedup_query).createOrReplaceTempView(old_df) 67 | 68 | join_condition = list(map(lambda x: str(f'{old_df}.{x} = {new_df}.{x}'), primary_keys)) 69 | exclude_condition = list(map(lambda x: str(f'{old_df}.{x} IS NULL'), primary_keys)) 70 | new_pks_query = f"SELECT {new_df}.* FROM {new_df} LEFT JOIN {old_df} ON {' AND '.join(join_condition)} WHERE {' AND '.join(exclude_condition)}" 71 | updates_query = f"SELECT {new_df}.* FROM {new_df} INNER JOIN {old_df} ON {' AND '.join(join_condition)} WHERE {new_df}.hashcode <> {old_df}.hashcode" 72 | print(f"Fetch only New PK records query = {new_pks_query}") 73 | print(f"Fetch updated records query = {updates_query}") 74 | new_pk_records_df: DataFrame = spark.sql(new_pks_query).dropDuplicates() 75 | updates_df: DataFrame = spark.sql(updates_query).dropDuplicates() 76 | 77 | return new_pk_records_df.union(updates_df) 78 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/etl/CColumn.py: 
-------------------------------------------------------------------------------- 1 | from pyspark.sql.types import * 2 | 3 | from com.vitthalmirji.etl import ETL 4 | 5 | 6 | class CColumn: 7 | def __init__(self, colname, coldatatype, pk, filterclause, udf="", udfargs=[], casttype="", aliasname=""): 8 | self.colname = colname 9 | self.coldatatype = coldatatype 10 | self.pk = pk 11 | self.udf = udf 12 | self.udfargs = udfargs 13 | self.aliasname = aliasname 14 | self.casttype = casttype 15 | self.filterclause = filterclause 16 | self.selectexpression = "" 17 | self.matchmetatype = { 18 | 'tinyint': IntegerType(), 19 | 'smallint': IntegerType(), 20 | 'int': IntegerType(), 21 | 'bigint': LongType(), 22 | 'long': LongType(), 23 | 'float': FloatType(), 24 | 'double': DoubleType(), 25 | 'boolean': BooleanType(), 26 | 'string': StringType(), 27 | 'date': DateType(), 28 | 'timestamp': TimestampType(), 29 | 'binary': BinaryType() 30 | } 31 | 32 | def applyUdf(self): 33 | if ETL.isNullOrEmpty(self.udf) is None and len(self.udfargs) is 0: 34 | # tempcol: pyspark.sql.column.Column = col(str(self.colname)) 35 | # tempcol.cast(self.matchmetatype[self.casttype]).alias(self.aliasname) 36 | self.selectexpression = f"CAST({self.colname} AS {self.casttype}) AS {self.aliasname}," 37 | elif ETL.isNullOrEmpty(self.udf) is not None and len(self.udfargs) is 0: 38 | # tempcol = col(self.colname) 39 | # kwargs = {'field': tempcol} 40 | # udfFunc = getattr(ETL, f"udf{str(self.udf).title()}") 41 | # tempcol = udfFunc(tempcol) 42 | # tempcol = tempcol.cast(self.matchmetatype[self.casttype]).alias(self.aliasname) 43 | self.selectexpression = f"CAST({self.udf}({self.colname}) AS {self.casttype}) AS {self.aliasname}," 44 | elif ETL.isNullOrEmpty(self.udf) is not None and len(self.udfargs) is not 0: 45 | # tempcol = col(self.colname) 46 | # udfFunc = getattr(ETL, f"udf{str(self.udf).title()}") 47 | # tempcol = udfFunc(tempcol) 48 | # tempcol = tempcol.cast(self.matchmetatype[self.casttype]).alias(self.aliasname) 49 | self.selectexpression = f"CAST({self.udf}({self.colname}, {','.join(self.udfargs)}) AS {self.casttype}) AS {self.aliasname}" 50 | else: 51 | self.selectexpression = f"CAST({self.colname} AS {self.casttype}) AS {self.aliasname}," 52 | return self.selectexpression 53 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/etl/ETL.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | import pyspark.sql.functions as f 4 | import pytz 5 | from pyspark.sql import SparkSession 6 | from pyspark.sql.types import * 7 | 8 | lookup = {} 9 | 10 | 11 | # ToDo - Yet to add many potential UDFs 12 | 13 | def registerAllUDF(sc: SparkSession): 14 | sc.udf.register(name='datetimetogmt', f=datetimeToGMT) 15 | sc.udf.register(name='zonedatetimetogmt', f=zoneDatetimeToGMTZone) 16 | sc.udf.register(name='isnullorempty', f=isNullOrEmpty) 17 | sc.udf.register(name='datetimetogmt', f=datetimeToGMT) 18 | sc.udf.register(name='udfnvl', f=udfNvl) 19 | sc.udf.register(name='udflookup', f=udfLookups) 20 | 21 | 22 | def datetimeToGMT(dt, fmt): 23 | local = pytz.timezone("America/Los_Angeles") 24 | # format = "%Y-%m-%d %H:%M:%S" 25 | naive = datetime.datetime.strptime(str(dt).strip(), str(fmt).strip()) 26 | local_dt = local.localize(naive, is_dst=None) 27 | utc_dt = local_dt.astimezone(pytz.utc) 28 | return utc_dt 29 | 30 | 31 | def strSplitSep(s, sep=','): 32 | return str(s).split(str(sep)) 33 | 34 | 35 | def varargsToList(*fields, 
sep): 36 | return str(sep).join(fields) 37 | 38 | 39 | def zoneDatetimeToGMTZone(dt, fmt, zone): 40 | local = pytz.timezone(str(zone).strip()) 41 | # format = "%Y-%m-%d %H:%M:%S" 42 | naive = datetime.datetime.strptime(str(dt).strip(), str(fmt).strip()) 43 | local_dt = local.localize(naive, is_dst=None) 44 | utc_dt = local_dt.astimezone(pytz.utc) 45 | return utc_dt 46 | 47 | 48 | @f.udf(returnType=StringType()) 49 | def udfNvl(field): 50 | if isNullOrEmpty(field) is None: 51 | return "-" 52 | else: 53 | return field 54 | 55 | 56 | @f.udf(returnType=StringType()) 57 | def udfLookups(clname, s): 58 | finallookupvalue = [] 59 | if s is None: 60 | return "" 61 | else: 62 | codes = str(s).split(sep=';') 63 | for cd in codes: 64 | if f"{clname} {cd}" in lookup.keys(): 65 | finallookupvalue.append(lookup[f"{clname} {cd}"]) 66 | else: 67 | finallookupvalue.append(cd) 68 | 69 | return ';'.join(finallookupvalue) 70 | 71 | 72 | def squared_udf(s): 73 | if s is None: 74 | return None 75 | return s * s 76 | 77 | 78 | def nullString(s): 79 | return s is None or str(s).strip().__eq__("") is None 80 | 81 | 82 | def isNullOrEmpty(s): 83 | if s is None: 84 | return None 85 | if str(s).strip() is None or str(s).strip().__eq__(""): 86 | return None 87 | return str(s).strip() 88 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/etl/ETLTransform.py: -------------------------------------------------------------------------------- 1 | import functools 2 | 3 | import pyspark.sql.functions as SparkSQLFunctions 4 | from pyspark.sql import DataFrame, SparkSession 5 | 6 | from com.vitthalmirji.etl import ETL 7 | from com.vitthalmirji.etl.ITable import SourceTable, TargetTable, matchEqualityOperator 8 | from com.vitthalmirji.etl.meta import MetaModel 9 | from com.vitthalmirji.etl.meta.MetaModel import MetaResult 10 | 11 | 12 | # ToDo - Source & Target group aggregations 13 | 14 | class Transform: 15 | def __init__(self, targettable, model: MetaModel, sc: SparkSession): 16 | self.model = model 17 | self.spark = sc 18 | self.sourcetables: list[SourceTable] = [] 19 | self.targettable = targettable 20 | self.transformquery = "" 21 | self.joindict = {} 22 | self.sourcetablesdf: list[DataFrame] = [] 23 | self.targetdf: DataFrame = None 24 | self.targetcolumnslist = [] 25 | self.joincolumns = None 26 | self.jointype = None 27 | 28 | def genericDfOperation(self, operationFunc): 29 | return operationFunc(self) 30 | 31 | DataFrame.genericDfOperation = genericDfOperation 32 | 33 | def filterSourceTable(self, srctbl): 34 | srctbls = filter(lambda tbl: tbl.tablename == srctbl, self.sourcetables) 35 | return list(srctbls) 36 | 37 | def joinDataframes(self, dict1, dict2): 38 | targetdf: DataFrame = dict1['df'].join(dict2['df'], on=dict2['condition'], how=dict2['jointype']) 39 | return {'df': targetdf} 40 | 41 | def mapAggregationFunction(self, fieldname, functionname): 42 | if str(functionname).__eq__('min'): 43 | return SparkSQLFunctions.min(col=SparkSQLFunctions.col(fieldname)) 44 | elif str(functionname).__eq__('max'): 45 | return SparkSQLFunctions.max(col=SparkSQLFunctions.col(fieldname)) 46 | elif str(functionname).__eq__('count'): 47 | return SparkSQLFunctions.count(col=SparkSQLFunctions.col(fieldname)) 48 | elif str(functionname).__eq__('sum'): 49 | return SparkSQLFunctions.sum(col=SparkSQLFunctions.col(fieldname)) 50 | elif str(functionname).__eq__('avg'): 51 | return SparkSQLFunctions.avg(col=SparkSQLFunctions.col(fieldname)) 52 | 53 | def applyJoin(self): 54 | 
self.query, self.joindict = self.model.joinSQL(self.model.datamodel, 'purchase', 'product', 'store') 55 | 56 | joinlist = [] 57 | for k in self.joindict.keys(): 58 | srctabledf: DataFrame = self.filterSourceTable(k)[0].targetdf 59 | self.joindict[k].update({'df': srctabledf}) 60 | joinlist.append(self.joindict[k]) 61 | 62 | self.targetdf: DataFrame = functools.reduce(self.joinDataframes, joinlist)['df'] 63 | 64 | def applyFilters(self): 65 | tblinfo: MetaResult = self.model.filterMetaResultBySourceTable(self.sourcetables[0].tablename) 66 | targettable: TargetTable = TargetTable(sourcesystem=tblinfo.src_system, tablename=tblinfo.target_table, pk=[], 67 | database=tblinfo.target_database, 68 | filetype=tblinfo.target_filetype, filepath=tblinfo.target_file_path, 69 | modeltableorder=tblinfo.src_table_order) 70 | 71 | for metares in self.model.metaresultlist: 72 | filterexpr = matchEqualityOperator(expression=metares.src_col_filter) 73 | if filterexpr is not None and not filterexpr.__eq__("") and not filterexpr.lower().__eq__('none'): 74 | self.filterclause = f"{self.filterclause} {metares.target_col}{filterexpr}".strip() 75 | 76 | self.filterclause = self.filterclause.strip() 77 | 78 | if self.filterclause is None: 79 | self.filterclause = "" 80 | 81 | targettable.df: DataFrame = self.targetdf.filter(self.filterclause) 82 | 83 | def applyGroupAndAggregation(self): 84 | selectlist = [] 85 | aggregations = {} 86 | for metares in self.model.filterMetaResultByTargetTable(self.targettable): 87 | if ETL.isNullOrEmpty(metares.target_col_aggregator) is not None: 88 | selectlist.append(metares.target_col) 89 | else: 90 | aggregations.update({ 91 | metares.target_col: { 92 | 'function': metares.target_col_aggregator, 93 | 'filter': metares.target_col_aggregator_filter 94 | } 95 | }) 96 | 97 | self.targetdf: DataFrame = self.targetdf.groupby(*selectlist).agg(SparkSQLFunctions.min) 98 | 99 | def transform(self): 100 | # Get Unique source table names for Transformation 101 | srctables = set() 102 | for metares in self.model.metaresultlist: 103 | srctables.add(metares.src_table) 104 | 105 | # For each source table create SourceTable object and assign transform columns 106 | for srctable in srctables: 107 | tablemetaresult = self.model.filterMetaResultBySourceTable(srctbl=srctable) 108 | tblinfo: MetaResult = tablemetaresult[0] 109 | 110 | fklist = [] 111 | 112 | for item in self.model.datamodel.keys(): 113 | if self.model.datamodel[item]['fk'] is not None or self.model.datamodel[item]['fk'] is {}: 114 | if srctable in self.model.datamodel[item]['fk'].keys(): 115 | fklist.extend(self.model.datamodel[item]['fk'][srctable]['fk_pk']) 116 | 117 | sourcetable: SourceTable = SourceTable(sourcesystem=tblinfo.src_system, tablename=tblinfo.src_table, 118 | pk=self.model.datamodel[tblinfo.src_table]['pk'], 119 | fk=fklist, 120 | database=tblinfo.src_database, filepath=tblinfo.src_file_path, 121 | filetype=tblinfo.src_filetype, 122 | modeltableorder=tblinfo.src_table_order) 123 | self.sourcetables.append(sourcetable) 124 | for tbl in tablemetaresult: 125 | sourcetable.addColumn(name=tbl.src_col, type=tbl.src_col_datatype, 126 | pk=(True, False)[tbl.src_key_constraints.__eq__('pk')], 127 | udf=tbl.udf, udfargs=tbl.udfarguments, casttype=tbl.target_col_datatype, 128 | aliasname=tbl.target_col, filterclause=tbl.src_col_filter, fk={}) 129 | 130 | # Read file as dataframe 131 | sourcetable.readFileFromSource(spark=self.spark) 132 | 133 | ETL.registerAllUDF(sc=self.spark) 134 | for sourcetable in self.sourcetables: 135 | 
sourcetable.applyTransform() 136 | 137 | self.applyJoin() 138 | 139 | self.applyFilters() 140 | 141 | self.applyGroupAggregation() 142 | 143 | self.targetdf.show() 144 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/etl/ITable.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from abc import ABC 3 | 4 | from pyspark.sql import DataFrame, SparkSession 5 | 6 | from com.vitthalmirji.etl import ETL 7 | from com.vitthalmirji.etl.CColumn import CColumn 8 | from com.vitthalmirji.imports.HdfsImport import HdfsImport 9 | 10 | 11 | class ITable: 12 | sourcesystem: str 13 | tablename: str 14 | columnlist: [] 15 | pk: [] 16 | fk: [] 17 | database: str 18 | filepath: str 19 | modeltableorder: int 20 | 21 | @abc.abstractmethod 22 | def getColumnList(self): [] 23 | 24 | @abc.abstractmethod 25 | def getPkList(self): [] 26 | 27 | @abc.abstractmethod 28 | def getFkList(self): [] 29 | 30 | @abc.abstractmethod 31 | def getPath(self): str 32 | 33 | @abc.abstractmethod 34 | def getDatabaseName(self): str 35 | 36 | @abc.abstractmethod 37 | def readFileFromSource(self, park: SparkSession, opt={}, tbl=""): DataFrame 38 | 39 | 40 | def matchEqualityOperator(expression): 41 | expr = str(expression) 42 | if expr is None or expr.__eq__("None"): 43 | expr = str("") 44 | elif expr.find('eq(') != -1: 45 | expr = expr.replace('eq(', '=').replace(')', '') 46 | if expr.find('gt') != -1: 47 | expr = expr.replace('gt(', '>').replace(')', '') 48 | elif expr.find('lt') != -1: 49 | expr = expr.replace('lt(', '<').replace(')', '') 50 | elif expr.find('lte') != -1: 51 | expr = expr.replace('lte(', '<=').replace(')', '') 52 | elif expr.find('gte') != -1: 53 | expr = expr.replace('gte(', '>=').replace(')', '') 54 | elif expr.find('notin') != -1: 55 | expr = expr.replace('notin', 'NOT IN') 56 | elif expr.find('in') != -1: 57 | expr = expr.replace('in', 'IN') 58 | elif expr.find('ne') != -1: 59 | expr = expr.replace('ne(', '<>').replace(')', '') 60 | else: 61 | expr = expr.strip() 62 | 63 | if expr is None or expr.__eq__('None'): 64 | expr = "" 65 | 66 | return expr 67 | 68 | 69 | class SourceTable(ITable): 70 | def __init__(self, sourcesystem, tablename, pk, fk, database, filetype, filepath, modeltableorder): 71 | self.tablename = tablename 72 | self.pk = pk 73 | self.fk = fk 74 | self.database = database 75 | self.sourcesystem = sourcesystem 76 | self.filepath = filepath 77 | self.filetype = filetype 78 | self.modeltableorder = modeltableorder 79 | self.df: DataFrame = None 80 | self.columnlist: list[CColumn] = [] 81 | self.filterclause = "" 82 | 83 | def getFilterCondition(self): 84 | return self.filterclause 85 | 86 | def addColumn(self, name, type, pk, udf, udfargs, casttype, aliasname, filterclause, fk={}) -> None: 87 | col = CColumn(colname=name, coldatatype=type, pk=pk, udf=udf, udfargs=udfargs, casttype=casttype, 88 | aliasname=aliasname, filterclause=filterclause) 89 | 90 | filterexpr = matchEqualityOperator(expression=filterclause) 91 | if filterexpr is not None and not filterexpr.__eq__("") and not filterexpr.lower().__eq__('none'): 92 | self.filterclause = f"{self.filterclause} {name}{filterexpr}".strip() 93 | 94 | self.filterclause = self.filterclause.strip() 95 | 96 | if self.filterclause is None: 97 | self.filterclause = "" 98 | 99 | self.columnlist.append(col) 100 | 101 | def getPkList(self) -> []: 102 | return self.pk 103 | 104 | def getFkList(self) -> []: 105 | return self.fk 106 | 107 | def 
getColumnList(self) -> []: 108 | return self.columnlist 109 | 110 | def getDatabaseName(self) -> str: 111 | return self.database 112 | 113 | def getPath(self) -> str: 114 | return self.filepath 115 | 116 | def readFileFromSource(self, spark: SparkSession, opt={}, tbl="") -> DataFrame: 117 | importModule = HdfsImport(spark=spark) 118 | sourcedf = importModule.readFromSource(location=self.filepath, filetype=self.filetype, opt=opt) 119 | self.df: DataFrame = sourcedf 120 | return sourcedf 121 | 122 | def getDf(self) -> DataFrame: 123 | return self.df 124 | 125 | def applyTransform(self): 126 | selectexpression = "" 127 | for _srccol in self.columnlist: 128 | srccol: CColumn = _srccol 129 | selectexpression = f"{selectexpression}{srccol.applyUdf()}" 130 | 131 | selectexpression = f"{selectexpression}--End" 132 | selectexpression = selectexpression.strip(',--End') 133 | 134 | for p in self.pk: 135 | selectexpression = f"{selectexpression}, {p} AS {self.tablename}{p}" 136 | 137 | for f in self.fk: 138 | selectexpression = f"{selectexpression}, {f}" 139 | 140 | if ETL.isNullOrEmpty(self.filterclause) is not None: 141 | self.targetdf: DataFrame = self.df.filter(self.filterclause).selectExpr(selectexpression) 142 | else: 143 | self.targetdf: DataFrame = self.df.selectExpr(selectexpression) 144 | 145 | return self.targetdf 146 | 147 | 148 | class TargetTable(ITable, ABC): 149 | def __init__(self, sourcesystem, tablename, pk, database, filetype, filepath, modeltableorder): 150 | self.tablename = tablename 151 | self.pk = pk 152 | self.database = database 153 | self.sourcesystem = sourcesystem 154 | self.filepath = filepath 155 | self.filetype = filetype 156 | self.modeltableorder = modeltableorder 157 | self.df: DataFrame = None 158 | self.columnlist: list[CColumn] = [] 159 | self.sourcetableslist = list[SourceTable] = [] 160 | self.filterclause = "" 161 | self.aggregationcolumns = [] 162 | self.aggregationfilter = [] 163 | 164 | def getPkList(self) -> []: 165 | return self.pk 166 | 167 | def getFkList(self) -> []: 168 | return self.fk 169 | 170 | def getColumnList(self) -> []: 171 | return self.columnlist 172 | 173 | def getDatabaseName(self) -> str: 174 | return self.database 175 | 176 | def getPath(self) -> str: 177 | return self.filepath 178 | 179 | def addColumn(self, name, type, pk, filterclause) -> None: 180 | col = CColumn(colname=name, coldatatype=type, pk=pk, filterclause=filterclause) 181 | 182 | filterexpr = matchEqualityOperator(expression=filterclause) 183 | if filterexpr is not None and not filterexpr.__eq__("") and not filterexpr.lower().__eq__('none'): 184 | self.filterclause = f"{self.filterclause} {name}{filterexpr}".strip() 185 | 186 | self.filterclause = self.filterclause.strip() 187 | 188 | if self.filterclause is None: 189 | self.filterclause = "" 190 | 191 | self.columnlist.append(col) 192 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/etl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/etl/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/etl/meta/__init__.py: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/etl/meta/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/imports/HdfsImport.py: -------------------------------------------------------------------------------- 1 | import abc 2 | from abc import ABC 3 | 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql.dataframe import DataFrame 6 | 7 | from com.vitthalmirji.etl import ETL 8 | 9 | 10 | class IImport: 11 | spark: SparkSession 12 | system: str 13 | table: str 14 | 15 | @abc.abstractmethod 16 | def readFromSource(self, location, filetype, opt={}, tbl=""): DataFrame 17 | 18 | @abc.abstractmethod 19 | def cleanup(self, location): None 20 | 21 | 22 | class HdfsImport(IImport, ABC): 23 | def __init__(self, spark: SparkSession): 24 | self.spark = spark 25 | 26 | def readFromSource(self, location, filetype, opt={}, tbl="") -> DataFrame: 27 | try: 28 | if str(filetype).lower().__eq__('tbl'): 29 | if ETL.isNullOrEmpty(tbl) is not None: 30 | try: 31 | _ = self.spark.read.table(tbl) 32 | except Exception as ex: 33 | print(f"Error reading table {tbl}") 34 | else: 35 | print(f"Invalid table {tbl} -Table do not exist in SQL Context: ") 36 | elif str(filetype).lower().__eq__('text'): 37 | return self.spark.read.text(paths=location, wholetext=True).toDF('line') 38 | elif str(filetype).lower().__eq__('csv'): 39 | return self.spark.read.options(header=True, inferSchema=True).csv(path=location) 40 | elif str(filetype).lower().__eq__('xml'): 41 | print(opt) 42 | return self.spark.read.format('com.databricks.spark.xml').options(rowTag='HotelDescriptiveContent', 43 | rootTag='HotelDescriptiveContents', 44 | valueTag='xmlvaluetag', 45 | attributePrefix="@").load( 46 | path=location) 47 | elif str(filetype).lower().__eq__('json'): 48 | return self.spark.read.options(options=opt).json(path=location) 49 | elif str(filetype).lower().__eq__('orc'): 50 | return self.spark.read.options(options=opt).orc(location) 51 | elif str(filetype).lower().__eq__('parquet'): 52 | return self.spark.read.options(options=opt).parquet(location) 53 | else: 54 | raise "Invalid filetype: " + filetype 55 | except Exception as ex: 56 | print("Error reading file in Spark of filetype " + filetype + " Error details: " + str(ex)) 57 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/imports/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/imports/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/kafka/Logger.py: -------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | 4 | from kafka import KafkaProducer 5 | 6 | logger_names = [] 7 | 8 | 9 | class Logger(logging.Handler): 10 | 11 | def __init__(self, Jobname, hostlist, topic, tls=None): 12 | self.__level = "INFO" 13 | self.__formatter = "%(asctime)s %(levelname)-8s %(message)s" 14 | self.__local_file_path = Jobname + ".log" 15 | logging.Handler.__init__(self) 16 | self.producer = KafkaProducer(bootstrap_servers=hostlist, 17 | value_serializer=lambda v: json.dumps(v).encode('utf-8'), 18 | linger_ms=10) 19 | self.topic = topic 20 | 21 | def get(self, name): 22 | global logger_names 23 
| logger = logging.getLogger(name) 24 | logger.setLevel(self.__level) 25 | if name not in logger_names: 26 | handler = logging.FileHandler(self.__local_file_path) 27 | formatter = logging.Formatter(self.__formatter) 28 | handler.setFormatter(formatter) 29 | handler.setLevel(self.__level) 30 | logger.addHandler(handler) 31 | logger_names.append(name) 32 | return logger 33 | 34 | # Write log to kafka topic 35 | def emit(self, record): 36 | # Avoid infinite loop by checking if Kafka's logs are looping in messages 37 | if 'kafka.' in record.name: 38 | return 39 | try: 40 | # apply the logger formatter 41 | msg = self.format(record) 42 | self.producer.send(self.topic, {'message': msg}) 43 | self.flush(timeout=1.0) 44 | except Exception: 45 | logging.Handler.handleError(self, record) 46 | 47 | def flush(self, timeout=None): 48 | # Flush all the objects 49 | self.producer.flush(timeout=timeout) 50 | 51 | def close(self): 52 | # Close producer and clean up 53 | self.acquire() 54 | try: 55 | if self.producer: 56 | self.producer.close() 57 | logging.Handler.close(self) 58 | finally: 59 | self.release() 60 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/kafka/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/kafka/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/main.py: -------------------------------------------------------------------------------- 1 | # Main file 2 | 3 | # Create objects & invoke methods required for your ETL process 4 | import datetime 5 | import logging 6 | 7 | from utils.Utilities import init_logging 8 | 9 | if __name__ == '__main__': 10 | init_logging(log_time_stamp=datetime.datetime.now().isoformat().__str__()) 11 | logging.debug("Hello") 12 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/mapper/Mapper.py: -------------------------------------------------------------------------------- 1 | import abc 2 | 3 | from pyspark.sql import SparkSession 4 | from pyspark.sql.dataframe import DataFrame 5 | from pyspark.sql.types import DataType, StructType, ArrayType, StructField, LongType 6 | 7 | 8 | class IMapper: 9 | @abc.abstractmethod 10 | def getDataframeSchema(self, df: DataFrame): DataFrame 11 | 12 | def createDDL(self, df: DataFrame, database, table, location): str 13 | 14 | 15 | def generate_deterministic_surrogate_key(spark: SparkSession, df: DataFrame, keyOffset=1, colName="keyName"): 16 | try: 17 | new_schema = StructType([StructField(colName, LongType(), True)] + df.schema.fields) 18 | new_rdd = df.rdd.zipWithIndex().map(lambda row: ([row[1] + keyOffset] + list(row[0]))) 19 | max_key = new_rdd.map(lambda x: x[0]).max() 20 | final_df = spark.createDataFrame(new_rdd, new_schema) 21 | return final_df, max_key, "success", "errorNotFound" 22 | except Exception as e: 23 | return df, keyOffset, "error", e 24 | 25 | 26 | class ComplexDataMapper(IMapper): 27 | outerselects = [] 28 | 29 | def __init__(self, sc): 30 | self.spark: SparkSession = sc 31 | 32 | def getDataframeSchema(self, df: DataFrame) -> StructType: 33 | return df.schema 34 | 35 | def createDDL(self, df: DataFrame, database, table, location): 36 | newline = '\n' 37 | ddl = str("") 38 | if database.__eq__(""): 39 | ddl = str(f"CREATE EXTERNAL TABLE {table} 
{newline}({newline}") 40 | else: 41 | ddl = str(f"CREATE EXTERNAL TABLE {database}.{table} {newline}({newline}") 42 | 43 | bigarraytypes: list[(str, str)] = None 44 | 45 | for field in df.schema.fields: 46 | if len(field.dataType.simpleString()) <= 100000: 47 | ddl = ddl + str(f"`{field.name}` {field.dataType.simpleString()},{newline}") 48 | else: 49 | print(f"Found big tag {field.name} skipping.. as the type definition exceeds more than value set in " 50 | f"Ambari > Hive > Configs > Advanced > Custom hive-site hive.metastore.max.typename.length=100000") 51 | # bigarraytypes += list[(field.name, field.dataType.sql)] 52 | 53 | ddl = ddl.rstrip(',\n') 54 | 55 | ddl += f"{newline}) {newline}" \ 56 | f"STORED AS PARQUET {newline}" \ 57 | f"LOCATION {location};{newline};" 58 | 59 | return ddl 60 | 61 | def createViews(self, df: DataFrame, root_table_name='xmltable', 62 | columns_cascade_to_leaf_level_with_alias=None) -> {}: 63 | views = {} 64 | views, xpaths = self.complexTypeIterator(viewname="", viewpath="", database="", 65 | table=root_table_name, level=0, 66 | dtype=df.schema, acc={}, root_table_name=root_table_name, 67 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level_with_alias) 68 | return views, xpaths 69 | 70 | def handleStructType(self, viewname, viewpath, database, table, level, dtype, columns_cascade_to_leaf_level, acc={}, 71 | xpath=[]) -> {}: 72 | structtype: StructType = dtype 73 | selcols = [] 74 | if columns_cascade_to_leaf_level is not None and len(columns_cascade_to_leaf_level) > 0: 75 | cascade_columns = f", {','.join(list(map(lambda c: f't{level}.{c}', columns_cascade_to_leaf_level)))}" 76 | else: 77 | cascade_columns = "" 78 | if viewname is None or str(viewname).__eq__(""): 79 | viewname = table 80 | for field in structtype.fields: 81 | if str(field.dataType).lower().startswith("struct"): 82 | selcols.append(f"t{level}.`{field.name}`") 83 | viewname = field.name 84 | viewpath = f"{table.replace('.', '_')}.{viewname.replace('.', '_')}" 85 | query = f"SELECT t{level}.`{field.name}`.*, t{level}.surrogate_id_{table}, " \ 86 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \ 87 | f"{cascade_columns} " \ 88 | f"FROM {table} t{level} " 89 | keynm = f"{table.replace('.', '_')}_{viewname}" 90 | acc.update({keynm: query}) 91 | self.handleStructType(viewname=viewname, viewpath=viewpath, database=database, table=keynm, level=level, 92 | dtype=field.dataType, acc=acc, 93 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level) 94 | elif str(field.dataType).lower().startswith("array"): 95 | selcols.append(f"t{level}.`{field.name}`") 96 | arrtype: ArrayType = field.dataType 97 | if str(arrtype.elementType).lower().startswith("struct"): 98 | viewname = field.name 99 | viewpath = f"{table.replace('.', '_')}.{viewname.replace('.', '_')}" 100 | query = f"SELECT v{level}.*, t{level}.surrogate_id_{table}, " \ 101 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \ 102 | f"{cascade_columns} " \ 103 | f"FROM {table} t{level} LATERAL VIEW INLINE(t{level}.`{field.name}`) v{level}" 104 | keynm = f"{table.replace('.', '_')}_{viewname}" 105 | acc.update({keynm: query}) 106 | self.handleStructType(viewname=viewname, viewpath=viewpath, database=database, table=keynm, 107 | level=level + 1, dtype=arrtype.elementType, acc=acc, 108 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level) 109 | else: 110 | viewname = field.name 111 | viewpath = f"{table.replace('.', '_')}.{viewname.replace('.', '_')}" 112 | query = 
f"SELECT v{level}.col AS {viewname}, " \ 113 | f"t{level}.surrogate_id_{table}, " \ 114 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \ 115 | f"{cascade_columns} " \ 116 | f"FROM {table} t{level} " \ 117 | f"LATERAL VIEW EXPLODE(t{level}.`{field.name}`) v{level}" 118 | keynm = f"{table.replace('.', '_')}_{viewname}" 119 | acc.update({keynm: query}) 120 | xpath.append(f'{viewpath.replace(".", "/")}/{field.name}') 121 | else: 122 | xpath.append(f'{viewpath.replace(".", "/")}/{field.name}') 123 | selcols.append(f"t{level}.`{field.name}`") 124 | 125 | if len(selcols) > 0: 126 | query = f"SELECT {','.join(selcols)}, " \ 127 | f"monotonically_increasing_id() AS surrogate_id_{table} " \ 128 | f"{cascade_columns} " \ 129 | f"FROM {table} t{level}" 130 | keynm = f"{table.replace('.', '_')}_{viewname}_outer" 131 | # acc.update({keynm: query}) 132 | return acc 133 | 134 | def handleArrayType(self, viewname, viewpath, database, table, level, dtype: ArrayType, 135 | columns_cascade_to_leaf_level, acc={}, xpath=[]) -> {}: 136 | if columns_cascade_to_leaf_level is not None and len(columns_cascade_to_leaf_level) > 0: 137 | cascade_columns = f", {','.join(list(map(lambda c: f't{level}.{c}', columns_cascade_to_leaf_level)))}" 138 | else: 139 | cascade_columns = "" 140 | if str(dtype.elementType).lower().startswith("struct"): 141 | arr_struct_type: StructType = dtype.elementType 142 | viewname = arr_struct_type.name 143 | viewpath = f"{table.replace('.', '_')}.{viewname.replace('.', '_')}" 144 | query = f"SELECT v{level}.*, t{level}.surrogate_id_{table}," \ 145 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \ 146 | f"{cascade_columns} " \ 147 | f"FROM {table} t{level} " \ 148 | f"LATERAL VIEW INLINE(t{level}.`{arr_struct_type.name}`) v{level}" 149 | keynm = f"{table.replace('.', '_')}_{viewname}" 150 | acc.update({keynm: query}) 151 | self.handleStructType(viewname=viewname, viewpath=viewpath, database=database, table=keynm, 152 | level=level + 1, dtype=arr_struct_type, acc=acc, 153 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level) 154 | else: 155 | viewname = viewname 156 | viewpath = viewpath 157 | query = f"SELECT v{level}.col AS {viewname}, t{level}.surrogate_id_{table}, " \ 158 | f"monotonically_increasing_id() AS surrogate_id_{viewpath.replace('.', '_')} " \ 159 | f"{cascade_columns} " \ 160 | f"FROM {table} t{level} " \ 161 | f"LATERAL VIEW EXPLODE(t{level}.`{viewname}`) v{level}" 162 | keynm = f"{table.replace('.', '_')}_{viewname}" 163 | acc.update({keynm: query}) 164 | xpath.append(f'{viewpath.replace(".", "/")}/{viewname}') 165 | return acc, xpath 166 | 167 | def complexTypeIterator(self, viewname, viewpath, database, table, level, 168 | dtype: DataType, root_table_name, columns_cascade_to_leaf_level, acc={}, xpath=[]) -> {}: 169 | if viewname is None or str(viewname).__eq__(""): 170 | keynm = f"{table.replace('.', '_')}" 171 | if columns_cascade_to_leaf_level is not None: 172 | cascade_columns = f", {','.join(list(map(lambda c: f't{level}.{c}', columns_cascade_to_leaf_level)))}" 173 | else: 174 | cascade_columns = "" 175 | query = f"SELECT t{level}.*, " \ 176 | f"monotonically_increasing_id() AS surrogate_id_{table} " \ 177 | f"{cascade_columns} " \ 178 | f"FROM {root_table_name} t{level}" 179 | acc.update({keynm: query}) 180 | table = keynm 181 | 182 | columns_cascade_to_leaf_level = list( 183 | map(lambda c: f"{c.split('AS')[-1].strip()} AS {c.split('AS')[-1].strip()}", 184 | columns_cascade_to_leaf_level)) 185 | 
186 | if dtype.typeName().lower().__eq__("struct"): 187 | self.handleStructType(viewname=viewname, viewpath=viewpath, database=database, table=table, level=level, 188 | dtype=dtype, acc=acc, xpath=[], 189 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level) 190 | elif dtype.typeName().lower().__eq__("array"): 191 | self.handleArrayType(viewname=viewname, viewpath=viewpath, database=database, table=table, level=level, 192 | dtype=dtype, acc=acc, xpath=[], 193 | columns_cascade_to_leaf_level=columns_cascade_to_leaf_level) 194 | else: 195 | xpath.append(f'{viewpath.replace(".", "/")}/{viewname}') 196 | return acc, xpath 197 | return acc, xpath 198 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/mapper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/mapper/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/objects/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/objects/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/objects/enums/Environments.py: -------------------------------------------------------------------------------- 1 | import abc 2 | import socket 3 | 4 | from pyspark.sql import SparkSession 5 | 6 | 7 | class Environment: 8 | def __init__(self, name, nameservice, zookeeperquorum, historyserver): 9 | self.name = name 10 | self.nameservice = nameservice 11 | self.zookeeperquorum = zookeeperquorum 12 | self.historyserver = historyserver 13 | 14 | 15 | class IEnvironment: 16 | @abc.abstractmethod 17 | def getEnvironment(self, sc: SparkSession): Environment 18 | 19 | def getEnvironmentByServer(self): Environment 20 | 21 | 22 | class Environments(IEnvironment): 23 | def __init__(self): 24 | self.local = Environment("local", "", "localhost", "localhost:18081") 25 | self.dev = Environment("dev", "", "localhost", "localhost:18081") 26 | self.intg = Environment("intg", "", "localhost", "localhost:18081") 27 | self.test = Environment("test", "", "localhost", "localhost:18081") 28 | self.prod = Environment("prod", "", "localhost", "localhost:18081") 29 | 30 | def getEnvironment(self, sc: SparkSession) -> Environment: 31 | hostname = socket.gethostname() 32 | if hostname.lower().startswith("v") or hostname.lower().startswith("u"): 33 | return self.local 34 | elif hostname.lower().startswith("intg"): 35 | return self.intg 36 | elif hostname.lower().startswith("test"): 37 | return self.test 38 | elif hostname.lower().startswith("prod"): 39 | return self.prod 40 | 41 | def getEnvironmentByServer(self) -> Environment: 42 | hostname = socket.gethostname() 43 | if hostname.lower().startswith("v") or hostname.lower().startswith("u"): 44 | return self.local 45 | elif hostname.lower().startswith("intg"): 46 | return self.intg 47 | elif hostname.lower().startswith("test"): 48 | return self.test 49 | elif hostname.lower().startswith("prod"): 50 | return self.prod 51 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/objects/enums/Zones.py: 
-------------------------------------------------------------------------------- 1 | # ToDo - More extentions (if any) to various zones 2 | class Zones: 3 | def __init__(self): 4 | self.stage = "stage" 5 | self.work = "work" 6 | self.publish = "publish" 7 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/objects/enums/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/objects/enums/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/MockupData.py: -------------------------------------------------------------------------------- 1 | from random import Random 2 | from typing import Optional, Any 3 | 4 | from pyspark.sql.types import * 5 | 6 | 7 | # ToDo Yet to complete Random data generation 8 | class Maybe(object): 9 | def get_or_else(self, default): 10 | return self.value if isinstance(self, Just) else default 11 | 12 | 13 | class Just(Maybe): 14 | def __init__(self, value): 15 | self.value = value 16 | 17 | 18 | class Nothing(Maybe): 19 | pass 20 | 21 | 22 | # Random data generators for Spark SQL DataTypes. These generators do not generate uniformly random 23 | # values; instead, they're biased to return "interesting" values (such as maximum / minimum values) 24 | # with higher probability. 25 | class MockupData: 26 | # The conditional probability of a non-null value being drawn from a set of "interesting" values 27 | # instead of being chosen uniformly at random. 28 | PROBABILITY_OF_INTERESTING_VALUE: float = 0.5 29 | 30 | # The probability of the generated value being null 31 | PROBABILITY_OF_NULL: float = 0.1 32 | 33 | MAX_STR_LEN: int = 1024 34 | MAX_ARR_SIZE: int = 128 35 | MAX_MAP_SIZE: int = 128 36 | 37 | # Returns a randomly generated schema, based on the given accepted types. 38 | # @param numFields the number of fields in this schema 39 | # @param acceptedTypes types to draw from. 40 | def randomSchema(self, rand: Random, numFields: int, acceptedTypes: list[DataType]) -> StructType: 41 | structfields = [] 42 | i = 0 43 | while i < numFields: 44 | dt = acceptedTypes[rand.randint(1, len(acceptedTypes))] 45 | structfields.append(StructField(f"col_{i}", dt, nullable=bool(rand.getrandbits(1)))) 46 | return StructType(structfields) 47 | 48 | # Returns a function which generates random values for the given `DataType`, or `None` if no 49 | # random data generator is defined for that data type. The generated values will use an external 50 | # representation of the data type; for example, the random generator for `DateType` will return 51 | # instances of [[java.sql.Date]] and the generator for `StructType` will return a [[Row]]. 52 | # For a `UserDefinedType` for a class X, an instance of class X is returned. 53 | # #@param dataType the type to generate values for 54 | # @param nullable whether null values should be generated 55 | # @param rand an optional random number generator 56 | # @return a function which can be called to generate random values. 57 | def forType(self, dataType: DataType, nullable: bool, rand: Random = Random()) -> Optional[Any]: 58 | return Optional[Any]() 59 | 60 | # Generates a random row for `schema`. 
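    # For each field: a nullable array field becomes None with PROBABILITY_OF_NULL,
    # otherwise its elements are drawn from the forType() generator of the element type;
    # struct fields recurse into randomRow(); all other fields draw a single value from
    # the forType() generator for their data type.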
61 | def randomRow(self, rand: Random, schema: StructType) -> Row: 62 | fields = list(StructField) 63 | for f in schema.fields: 64 | if str(f.dataType).lower().__eq__("arraytype"): 65 | data = None 66 | if f.nullable and rand.random() <= self.PROBABILITY_OF_NULL: 67 | data = None 68 | else: 69 | arr = [] 70 | n = 1 71 | i = 0 72 | _f: ArrayType = f.dataType() 73 | generator = self.forType(_f.elementType, f.nullable, rand) 74 | assert (generator.isDefined, "Unsupported") 75 | gen = generator.get 76 | while i < n: 77 | arr.append(gen) 78 | i = i + 1 79 | data = arr 80 | fields.append(data) 81 | elif str(f.dataType).lower().__eq__("structtype"): 82 | _f: StructType = f 83 | for c in _f: 84 | fields.append(self.randomRow(rand, StructType(c.dataType()))) 85 | else: 86 | generator = self.forType(f.dataType, f.nullable, rand) 87 | assert (generator.isDefined, "Unsupported") 88 | gen = generator.get 89 | fields.append(gen) 90 | return Row(*fields) 91 | 92 | # Returns a random nested schema. This will randomly generate structs and arrays drawn from 93 | # acceptedTypes. 94 | def randomNestedSchema(self, rand: Random, totalFields: int, acceptedTypes: list[DataType]) -> StructType: 95 | fields = [] 96 | i = 0 97 | numFields = totalFields 98 | while numFields > 0: 99 | v = rand.randint(0, 3) 100 | if v is 0: 101 | # Simple type 102 | dt = acceptedTypes[rand.randint(0, len(acceptedTypes))] 103 | fields.append(StructField(f"col_{i}", dt, bool(rand.getrandbits(1)))) 104 | numFields = -1 105 | elif v is 1: 106 | # Array 107 | dt = acceptedTypes[rand.randint(0, len(acceptedTypes))] 108 | fields.append(StructField(f"col_{i}", ArrayType(dt), bool(rand.getrandbits(1)))) 109 | numFields = -1 110 | else: 111 | n = max(rand.randint(0, numFields), 1) 112 | nested = self.randomNestedSchema(rand, n, acceptedTypes) 113 | fields.append(StructField("col_" + i, nested, bool(rand.getrandbits(1)))) 114 | numFields = numFields - n 115 | 116 | i = i + 1 117 | return StructType(fields) 118 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/src/com/vitthalmirji/utils/__init__.py -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/audit_util.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | 4 | def audit_action(action): 5 | def audit_decorator(func): 6 | def audit(*args, **kwargs): 7 | # Invoke the wrapped function first 8 | retval = func(*args, **kwargs) 9 | # Now do something here with retval and/or action 10 | logging.debug(f'Executed {action}, Callback return value {retval}') 11 | return retval 12 | return audit 13 | return audit_decorator 14 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/comprehensive_logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | import sys 4 | from pathlib import Path 5 | 6 | from com.vitthalmirji.utils.constants import JOB_START_TIME 7 | from com.vitthalmirji.utils.helpers import read_json_get_dict, get_project_root 8 | 9 | 10 | def init_logging(job_name, log_time_stamp=JOB_START_TIME, log_path=f'{get_project_root()}/logs/python', 11 | 
log_properties_path=f"{get_project_root()}/conf/python/logging-properties.json"): 12 | """ 13 | Initiates the logging object with given configurations 14 | 15 | Args: 16 | :param log_properties_path: Location of properties file. 17 | default to local project folder's /conf/python/logging-properties.json 18 | :param job_name: Name of the application 19 | :param log_time_stamp: Timestamp to append in log file name 20 | :param log_path: Location to store logs. 21 | Default location /logs/python/ 22 | 23 | Returns: N/A 24 | """ 25 | Path(log_path).mkdir(parents=True, exist_ok=True) 26 | log_conf = read_json_get_dict(json_path=log_properties_path) 27 | log_file = f"{log_path}/log-{job_name}_{log_time_stamp}.log" 28 | log_conf['handlers']['file']['filename'] = log_file 29 | 30 | # In case of Unit test cases do not log to file 31 | if 'unittest' in sys.modules.keys(): 32 | log_conf['handlers'] = {'console': log_conf['handlers']['console']} 33 | log_conf['root']['handlers'] = ['console'] 34 | 35 | print('Logging initiating using below properties') 36 | print(log_conf) 37 | logging.config.dictConfig(log_conf) 38 | logging.info(f'Logging initiated; appending logs to {log_file}') 39 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/constants.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | 3 | from com.vitthalmirji.utils.helpers import get_user 4 | 5 | USER = get_user() 6 | JOB_START_TIME = datetime.now().strftime('%Y-%m-%dT%H-%M-%S-%f') 7 | SPARK_APPLICATION_NAME = f"Spark application launched by {USER}" 8 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/data_quality.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | from typing import List 4 | 5 | from pyspark.sql import DataFrame 6 | from pyspark.sql.functions import count 7 | 8 | from com.vitthalmirji.utils.helpers import create_dir, log_exception_details 9 | from com.vitthalmirji.utils.spark import get_or_create_spark_session 10 | 11 | 12 | class Rule(object): 13 | def __init__(self, rule_id: int, name: str, description: str, rule_type: str, columns: List[str] = None, 14 | query: str = None): 15 | self.rule_id = rule_id 16 | self.name = name 17 | self.description = description 18 | self.rule_type = rule_type 19 | self.columns = columns if columns else None 20 | self.query = query if query else None 21 | 22 | 23 | class RuleExecutionResult: 24 | def __init__(self, rule: Rule, status, pass_count: int, fail_count: int, total_count): 25 | self.rule = rule 26 | self.status = status 27 | self.pass_count = pass_count 28 | self.fail_count = fail_count 29 | self.total_count = total_count 30 | 31 | 32 | class DataQuality(object): 33 | def __init__(self, dq_id: int, rules: list[dict] = None, email_execution_report_to: str = None, 34 | execution_reports_dir: str = None): 35 | logging.info( 36 | f"Initializing Data quality service for DQ ID {dq_id}, reports will be available in file {execution_reports_dir}") 37 | self.html_report = None 38 | self.df = None 39 | self.total_count = None 40 | self.execution_results = None 41 | self.dq_id = dq_id 42 | self.rules: List[Rule] = [Rule(**rule) for rule in rules] if rules else None 43 | self.email_execution_report_to = email_execution_report_to if email_execution_report_to else None 44 | self.spark = get_or_create_spark_session() 45 | self.yarn_id 
= self.spark.sparkContext.applicationId 46 | self.execution_reports_dir = execution_reports_dir if execution_reports_dir else None 47 | if self.execution_reports_dir: 48 | create_dir(self.execution_reports_dir) 49 | 50 | def execute_unique_rule(self, rule: Rule): 51 | """ 52 | Executes Duplicates check on given Primary keys in `rule` 53 | 54 | Args: 55 | :param rule: Rule of type `unique` having list of primary keys 56 | 57 | Returns: 58 | :return: RuleExecutionResult with status fail if duplicates are present pass otherwise and count of duplicates 59 | 60 | Exceptions: 61 | :exception: Thrown by calling functions called in this function 62 | """ 63 | logging.warning(f"Executing DQ Rule for {rule.name} on {rule.columns}") 64 | dups_count = self.df.select(rule.columns).groupby(rule.columns).agg(count("*").alias('cnt')).alias( 65 | 'cnt').filter('cnt > 1').count() 66 | 67 | return RuleExecutionResult(rule, 'fail' if dups_count > 0 else 'pass', self.total_count - dups_count, 68 | dups_count, self.total_count) 69 | 70 | def execute_not_null_rule(self, rule: Rule): 71 | """ 72 | Executes Not null check on given list of columns in `rule` 73 | 74 | Args: 75 | :param rule: Rule of type `not null` having list of columns potentially not null 76 | 77 | Returns: 78 | :return: RuleExecutionResult with status fail if column values are null & pass otherwise and count of null records 79 | 80 | Exceptions: 81 | :exception: Thrown by calling functions called in this function 82 | """ 83 | logging.warning(f"Executing DQ Rule for {rule.name} on {rule.columns}") 84 | filter_string = ' OR '.join(list(map(lambda c: f'{c} IS NULL OR TRIM({c}) = ""', rule.columns))) 85 | not_null_count = self.df.select(rule.columns).filter(filter_string).count() 86 | return RuleExecutionResult(rule, 'fail' if not_null_count > 0 else 'pass', self.total_count - not_null_count, 87 | not_null_count, self.total_count) 88 | 89 | def execute_query_rule(self, rule: Rule): 90 | """ 91 | Executes query given in `rule` 92 | This is in case of custom data quality rule given in form of query 93 | Args: 94 | :param rule: Rule of type `query` having query to execute 95 | 96 | Returns: 97 | :return: RuleExecutionResult with status fail if duplicates are present pass otherwise and count of duplicates 98 | 99 | Exceptions: 100 | :exception: Thrown by calling functions called in this function 101 | """ 102 | self.df.createOrReplaceTempView('temp') 103 | query = rule.query 104 | logging.warning(f"Executing DQ Rule for {rule.name} using query {rule.query}") 105 | query_count = self.spark.sql(query).count() 106 | return RuleExecutionResult(rule, 'fail' if query_count > 0 else 'pass', 107 | self.total_count - query_count, 108 | query_count, self.total_count) 109 | 110 | def execute_rules(self, df: DataFrame) -> tuple[bool, str]: 111 | """ 112 | Executes list of rules (data quality checks) given on dataframe's data 113 | Args: 114 | :param df: Dataframe on which quality checks to be executed 115 | :param rules: List of rules mapped to Rule type 116 | 117 | Returns: 118 | :return: boolean status True if all rules executed successfully without any failures, False otherwise and 119 | HTML report of details executed rules 120 | 121 | Exceptions: 122 | :exception: All exceptions thrown by calling functions called in this function 123 | """ 124 | logging.info("Starting data quality rules executions..") 125 | self.execution_results: List[RuleExecutionResult] = [] 126 | self.df = df 127 | self.total_count = self.df.count() 128 | for unique_rule in 
list(filter(lambda r: r.rule_type.__eq__('unique'), self.rules)): 129 | self.execution_results.append(self.execute_unique_rule(unique_rule)) 130 | 131 | for not_null_rule in list(filter(lambda r: r.rule_type.__eq__('not null'), self.rules)): 132 | self.execution_results.append(self.execute_not_null_rule(not_null_rule)) 133 | 134 | for query_rule in list(filter(lambda r: r.rule_type.__eq__('query'), self.rules)): 135 | self.execution_results.append(self.execute_query_rule(query_rule)) 136 | 137 | return False if list(filter(lambda exec_result: exec_result.status.__eq__('fail'), self.execution_results)) \ 138 | else True, self.generate_report() 139 | 140 | def generate_report(self): 141 | """ 142 | Generates HTML report of result of executed data quality checks 143 | 144 | Args: N/A 145 | 146 | Returns: 147 | :return: self.html_report a HTML report of details about executed DQ checks 148 | 149 | Exceptions: 150 | :exception: All exception thrown by calling functions called in this function 151 | """ 152 | logging.info(f"Preparing Data quality rules report for {self.dq_id}") 153 | table_header = ' '.join(list( 154 | map(lambda header: f"{header}", ["Yarn Application Id", "DQ ID", "Rule ID", "Rule Name", 155 | "Rule type", "Description", "Columns/Query", "Pass Count", 156 | "Fail Count", 157 | "Total Count"]))) 158 | 159 | def rules_and_result(result: RuleExecutionResult): 160 | table_data = [self.yarn_id, 161 | self.dq_id, 162 | result.rule.rule_id, 163 | result.rule.name, 164 | result.rule.rule_type, 165 | result.rule.description, 166 | result.rule.columns, 167 | result.pass_count, 168 | result.fail_count, 169 | result.total_count 170 | ] 171 | return ' '.join(list(map(lambda d: f"{d}", table_data))) 172 | 173 | failed_rules = list(filter(lambda result: result.status.__eq__('fail'), self.execution_results)) 174 | failed_details = ' '.join(list(map(lambda result: f"{rules_and_result(result)}", failed_rules))) 175 | failure_table = f'

<h3>Failed DQ details</h3><table border="1">' \
176 |                         f'{table_header}' \
177 |                         f'{failed_details}</table>' if failed_rules else ""
178 | 
179 |         passed_rules = list(filter(lambda result: result.status.__eq__('pass'), self.execution_results))
180 |         passed_details = ' '.join(list(map(lambda result: f"{rules_and_result(result)}", passed_rules)))
181 |         passed_table = f'<br><h3>Succeeded DQ details</h3><table border="1">' \
182 |                        f'{table_header}' \
183 |                        f'{passed_details}</table>' if passed_rules else ""
184 | 
185 |         opening_statement = "<br><br>Team,<br><br>" \
186 |                             f"Data Quality check finished successfully for DQ ID = {self.dq_id}" \
187 |                             f"{', with failures. ' if failed_rules else '. '}" \
188 |                             "Check details in below table of metrics.<br><br>"
189 |         closing_statement = "<br><br>" \
190 |                             f"Executed on {datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')},<br>" \
191 |                             "Thanks<br>
" 192 | 193 | self.html_report = f"{opening_statement}" \ 194 | f"{failure_table if failed_rules else ''}" \ 195 | f"{passed_table if passed_rules else ''}" \ 196 | f"{closing_statement}" 197 | 198 | return self.html_report 199 | 200 | def write_report_to_html(self, file_name): 201 | """ 202 | Writes Data Quality rules execution results to a html file 203 | 204 | Args: 205 | :param file_name: name of file to write report as HTML file 206 | 207 | Returns: 208 | :return: N/A 209 | 210 | Exceptions: 211 | :exception Throws exception if unable to write into html file but will not halt the execution process 212 | """ 213 | logging.info(f"Writing data quality execution report to html file {self.execution_reports_dir}/{file_name}") 214 | try: 215 | if not self.execution_reports_dir: 216 | raise Exception("Empty file path") 217 | f = open(self.execution_reports_dir + "/" + file_name, "w") 218 | f.write(self.html_report) 219 | f.close() 220 | except Exception as ex: 221 | log_exception_details(message="Error writing report to html, skipping writing report", 222 | exception_object=ex) 223 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/helpers.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import json 3 | import logging 4 | import traceback 5 | from pathlib import Path 6 | 7 | import isodate 8 | from isodate import ISO8601Error 9 | 10 | 11 | def create_dir(dir_path): 12 | """ 13 | Creates directory from given path 14 | :param dir_path: relative path of directory to create 15 | :return: N/A 16 | """ 17 | try: 18 | Path(dir_path).mkdir(parents=True, exist_ok=True) 19 | except Exception as ex: 20 | msg = f"Error creating directory from given relative path {dir_path}" 21 | log_exception_details(message=msg, exception_object=ex) 22 | raise ex 23 | 24 | 25 | def get_user(): 26 | """ 27 | Fetches username of the executor 28 | 29 | Args: 30 | 31 | Returns: 32 | :return: username of the executor / logged in machine 33 | """ 34 | return getpass.getuser() 35 | 36 | 37 | def is_null_or_empty(obj) -> bool: 38 | """ 39 | Checks if an object is null or empty if object is of type string 40 | 41 | Args: 42 | :param obj: object / variable to validate 43 | 44 | Returns: 45 | :return: bool True of object is null or string is empty False otherwise 46 | """ 47 | if obj is None: 48 | return True 49 | elif type(obj) is str and str(obj).strip().__eq__(''): 50 | return True 51 | else: 52 | return False 53 | 54 | 55 | def get_project_root() -> Path: 56 | """ 57 | Identifies project root, Returns project root, the repository root 58 | Args: 59 | 60 | Returns: 61 | :return: project's root path as type Path 62 | """ 63 | return Path(__file__).parent.parent.parent.parent.parent 64 | 65 | 66 | def read_json_get_dict(json_path) -> dict: 67 | """ 68 | Reads json file from given `json_path` & returns as python dict 69 | Args: 70 | :param :json_path : Absolute or Relative path of json file to read & convert 71 | 72 | Return: 73 | :return :json_as_dict: JSON content as dictionary type 74 | """ 75 | try: 76 | with open(json_path, 'r') as stream: 77 | json_as_dict = json.load(stream) 78 | stream.close() 79 | return json_as_dict 80 | except Exception as ex: 81 | log_exception_details(f'Error reading json file {json_path}, error traceback below', ex) 82 | 83 | 84 | def log_exception_details(message, exception_object): 85 | """ 86 | Logs the exception to console & log file for every exception 87 | 88 | Args: 89 | 
:param message: Developer's message on exception 90 | :param exception_object: Class object of the exception 91 | 92 | Returns: N/A 93 | """ 94 | logging.error(exception_object.__str__()) 95 | logging.error(traceback.format_exc()) 96 | logging.exception(message) 97 | 98 | 99 | def convert_iso_to_time_duration(iso_time_duration: str): 100 | """ 101 | Converts ISO time duration to time in hours, minutes & seconds 102 | 103 | Args: 104 | :param iso_time_duration: ISO time in string Example: PT1H, PT100M, PT2H5M 105 | 106 | Returns: 107 | :return: Returns duration as datetime.timedelta type. 108 | Example: 01:00:00, 01:40:00, 02:05:00 109 | """ 110 | if is_null_or_empty(iso_time_duration): 111 | msg = f'Empty or Invalid time duration string {iso_time_duration}' 112 | logging.error(msg) 113 | return None 114 | try: 115 | return isodate.parse_duration(iso_time_duration) 116 | except ISO8601Error as isoError: 117 | msg = f"Error converting ISO time {iso_time_duration} to timedelta" 118 | log_exception_details(message=msg, exception_object=isoError) 119 | return None 120 | 121 | 122 | def add_iso_time_duration(time1: str, time2: str): 123 | """ 124 | Adds two string time duration, first converts to timedelta then adds to return the result 125 | Args: 126 | :param time1: First time as string value 127 | :param time2: Second time as string value 128 | 129 | Returns: 130 | :return: time1 + time2 as datetime.timedelta type 131 | """ 132 | if is_null_or_empty(time1) or is_null_or_empty(time2): 133 | msg = f'Empty or Invalid time duration string time1 = {time1}, time2 = {time2}' 134 | logging.error(msg) 135 | return None 136 | 137 | try: 138 | _time1 = convert_iso_to_time_duration(iso_time_duration=time1) 139 | _time2 = convert_iso_to_time_duration(iso_time_duration=time2) 140 | return isodate.duration_isoformat((_time1 + _time2)) 141 | except ISO8601Error as isoError: 142 | msg = f"Error converting ISO time time1={time1} & time2={time2} to timedelta" 143 | logging.error(msg) 144 | log_exception_details(message=msg, exception_object=isoError) 145 | return None 146 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/logging_util.py: -------------------------------------------------------------------------------- 1 | import atexit 2 | import logging 3 | from logging.config import ConvertingList, ConvertingDict, valid_ident 4 | from logging.handlers import QueueHandler, QueueListener 5 | from queue import Queue 6 | 7 | 8 | def _resolve_handlers(l): 9 | if not isinstance(l, ConvertingList): 10 | return l 11 | # Indexing the list performs the evaluation. 
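    # ConvertingList holds handler references that logging.config resolves lazily;
    # reading each index forces the conversion so the QueueListener receives real
    # Handler objects instead of unresolved config references.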
12 | return [l[i] for i in range(len(l))] 13 | 14 | 15 | def _resolve_queue(q): 16 | if not isinstance(q, ConvertingDict): 17 | return q 18 | if '__resolved_value__' in q: 19 | return q['__resolved_value__'] 20 | 21 | cname = q.pop('class') 22 | klass = q.configurator.resolve(cname) 23 | props = q.pop('.', None) 24 | kwargs = {k: q[k] for k in q if valid_ident(k)} 25 | result = klass(**kwargs) 26 | if props: 27 | for name, value in props.items(): 28 | setattr(result, name, value) 29 | 30 | q['__resolved_value__'] = result 31 | return result 32 | 33 | 34 | class QueueListenerHandler(QueueHandler): 35 | def __init__(self, handlers, respect_handler_level=False, auto_run=True, queue=Queue(-1)): 36 | queue = _resolve_queue(queue) 37 | super().__init__(queue) 38 | handlers = _resolve_handlers(handlers) 39 | self._listener = QueueListener( 40 | self.queue, 41 | *handlers, 42 | respect_handler_level=respect_handler_level) 43 | if auto_run: 44 | self.start() 45 | atexit.register(self.stop) 46 | 47 | def start(self): 48 | self._listener.start() 49 | 50 | def stop(self): 51 | self._listener.stop() 52 | 53 | # def emit(self, record): 54 | # return super().emit(record) 55 | 56 | 57 | BLACK, RED, GREEN, YELLOW, BLUE, MAGENTA, CYAN, WHITE = range(8) 58 | 59 | # These are the sequences need to get colored output 60 | RESET_SEQ = "\033[0m" 61 | COLOR_SEQ = "\033[0;%dm" 62 | BOLD_SEQ = "\033[1m" 63 | 64 | COLORS = { 65 | 'WARNING': YELLOW, 66 | 'INFO': GREEN, 67 | 'DEBUG': MAGENTA, 68 | 'CRITICAL': RED, 69 | 'ERROR': RED 70 | } 71 | 72 | 73 | class ColoredFormatter(logging.Formatter): 74 | def __init__(self, msg, use_color=True): 75 | logging.Formatter.__init__(self, msg) 76 | self.use_color = use_color 77 | 78 | def format(self, record): 79 | if self.use_color and record.levelname in COLORS: 80 | # The background is set with 40 plus the number of the color, and the foreground with 30 81 | record.levelname = COLOR_SEQ % (30 + COLORS[record.levelname]) + record.levelname + RESET_SEQ 82 | return logging.Formatter.format(self, record) 83 | 84 | 85 | def formatter_message(message, use_color=True): 86 | if use_color: 87 | message = message.replace("$RESET", RESET_SEQ).replace("$BOLD", BOLD_SEQ) 88 | else: 89 | message = message.replace("$RESET", "").replace("$BOLD", "") 90 | return message 91 | 92 | 93 | class ColoredLogger(logging.Logger): 94 | # FORMAT = "$BOLD%(name)-20s$RESET][%(levelname)-18s] %(message)s ($BOLD%(filename)s$RESET:%(lineno)d)" 95 | def __init__(self, name): 96 | logging.Logger.__init__(self, name, logging.DEBUG) 97 | self.FORMAT = '%(asctime)s %(name)-15s [$BOLD%(levelname)-10s$RESET] %(process)-10d %(funcName)-30s %(message)s' 98 | self.COLOR_FORMAT = formatter_message(self.FORMAT, True) 99 | color_formatter = ColoredFormatter(self.COLOR_FORMAT) 100 | 101 | console = logging.StreamHandler() 102 | console.setFormatter(color_formatter) 103 | 104 | self.addHandler(console) 105 | return 106 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/spark.py: -------------------------------------------------------------------------------- 1 | import logging 2 | from typing import List 3 | 4 | from pyspark.sql import SparkSession, DataFrame 5 | from pyspark.sql.functions import concat_ws, col, floor, rand 6 | from pyspark.sql.types import StringType 7 | 8 | from com.vitthalmirji.utils.helpers import log_exception_details, is_null_or_empty 9 | 10 | 11 | def get_or_create_spark_session(need_hive_support: bool = False, 12 | spark_conf: List[dict] = 
[{'key': 'spark.app.name', 'value': ''}]) -> SparkSession: 13 | """ 14 | Creates a spark session with given configuration in parameters 15 | 16 | Args: 17 | :param application_name: Name of the spark application 18 | :param spark_conf: Specific Spark Configurations at user level (default is None) 19 | :param need_hive_support: Enable Hive support in spark session? (default is False) 20 | 21 | Returns: 22 | An object of SparkSession 23 | 24 | Exceptions: 25 | Throws any exception on to calling function that has encountered during creating SparkSession 26 | :exception type of exception is broader, this can be improvised to handle more specific exceptions 27 | """ 28 | spark: SparkSession 29 | try: 30 | spark: SparkSession = SparkSession.getActiveSession() 31 | if spark: 32 | logging.warning("Returning active spark session") 33 | return spark 34 | 35 | logging.warning(f"Creating spark session first time with configs {spark_conf}") 36 | 37 | if need_hive_support: 38 | spark = SparkSession.builder \ 39 | .enableHiveSupport() \ 40 | .getOrCreate() 41 | else: 42 | spark = SparkSession.builder \ 43 | .getOrCreate() 44 | 45 | for conf in list(spark_conf): 46 | spark.conf.set(**conf) 47 | 48 | logging.warning(f"Executor cores = {spark.conf.get('spark.executor.cores', 'Not set')}") 49 | logging.warning(f"Num Executors = {spark.conf.get('spark.executor.instances', 'Not set')}") 50 | return spark 51 | except Exception as ex: 52 | log_exception_details(message="Error creating spark session", exception_object=ex) 53 | raise ex 54 | 55 | 56 | def read_data_as_spark_dataframe(filetype: str, location: str, options={}, table_name=None) -> DataFrame: 57 | """ 58 | Reads various kind of files & tables in spark 59 | Args: 60 | :param filetype: 61 | :param location: 62 | :param options: 63 | :param table_name: 64 | 65 | Returns: 66 | :return: A DataFrame object 67 | 68 | Exception: 69 | Throws any exception that is encountered during file / table read in spark 70 | :exception type of exception is broader, this can be improvised to handle more specific exceptions 71 | """ 72 | logging.warning(f"Attempting to read {filetype} in spark using configs {options} from location {location}") 73 | spark = get_or_create_spark_session() 74 | try: 75 | if str(filetype).lower().__eq__('table'): 76 | if is_null_or_empty(table_name) is not None: 77 | try: 78 | _ = spark.read.options(**options).table(table_name) 79 | except Exception as ex: 80 | log_exception_details(message=f"Error reading table {table_name}", exception_object=ex) 81 | raise ex 82 | else: 83 | print(f"Invalid table {table_name} -Table do not exist in SQL Context: ") 84 | elif str(filetype).lower().__eq__('text'): 85 | logging.warning( 86 | "Lines will be read from the text file and dataframe will have single column by name 'line'") 87 | return spark.read.options(**options).text(paths=location).toDF('line') 88 | elif str(filetype).lower().__eq__('csv'): 89 | return spark.read.options(**options).csv(path=location) 90 | elif str(filetype).lower().__eq__('xml'): 91 | return spark.read.format('com.databricks.spark.xml').options(**options).load(path=location) 92 | elif str(filetype).lower().__eq__('json'): 93 | return spark.read.options(**options).json(path=location) 94 | elif str(filetype).lower().__eq__('orc'): 95 | return spark.read.options(**options).orc(location) 96 | elif str(filetype).lower().__eq__('parquet'): 97 | return spark.read.options(**options).parquet(location) 98 | else: 99 | raise Exception(f"Invalid filetype: {filetype}") 100 | except Exception as 
ex: 101 | log_exception_details(message=f"Error reading file in Spark of filetype {filetype}", exception_object=ex) 102 | raise ex 103 | 104 | 105 | def revise_shuffle_partitions(multiplier: int = 1): 106 | """ 107 | Sets the shuffle partition to total number of cores across all executors 108 | Useful in dataframe operations using spark 109 | :param multiplier: In case of stage failures increase the multiplier 110 | :return: N/A 111 | """ 112 | spark = get_or_create_spark_session() 113 | num_executors = int(spark.conf.get('spark.executor.instances', '2').strip()) 114 | num_cores = int(spark.conf.get('spark.executors.cores', '1').strip()) 115 | revised_shuffle_partition = num_executors * num_cores * multiplier 116 | spark.conf.set('spark.sql.shuffle.partitions', f"{revised_shuffle_partition}") 117 | 118 | 119 | def data_frame_repartition(df: DataFrame, num_files: int = None, use_coalesce=False, repartition_columns=None): 120 | """ 121 | Function to repartition data for better performance. 122 | Majorly has 2 types: #1 - coalesce: to narrow down files in output; #2 - repartition: to uniformly distribute data in output 123 | Note: This involves shuffling (wide transformation) 124 | Args: 125 | :param df: Dataframe on which repartition (wide transformation) to be performed 126 | :param num_files: Number of output files required 127 | :param use_coalesce: Use this to narrow down the number of files irrespective of any columns default is False 128 | :param repartition_columns: Columns on which repartition to be performed 129 | Most important note: Columns specified here must & should be low cardinality values in table 130 | Returns: 131 | :return: Dataframe with repartition or coalesce transformation applied 132 | """ 133 | if use_coalesce: 134 | return df.coalesce(num_files) 135 | 136 | columns_list = list(map(lambda column: col(column).cast(StringType()), 137 | repartition_columns)) if repartition_columns is not None else [] 138 | 139 | if num_files is None and len(columns_list) > 0: 140 | return df.repartition(*columns_list) 141 | 142 | salting_column = floor(rand() * num_files) 143 | temp_repartition_column = 'temp_repartition_column' 144 | return df.withColumn( 145 | temp_repartition_column, 146 | concat_ws('~', *columns_list, salting_column) 147 | ).repartition(temp_repartition_column).drop(temp_repartition_column) 148 | 149 | 150 | def standardize_and_rename_df_columns(df: DataFrame, column_names_to_rename: dict): 151 | """ 152 | Performs renaming column names on given dataframe: 153 | Trims if column name has leading & trailing whitespaces 154 | For given dictionary of columns renames according to specified name 155 | Args: 156 | :param df: DataFrame for renaming columns 157 | :param column_names_to_rename: dictionary having existing column name & revised / renaming column name 158 | 159 | Returns: 160 | :return: _df transformed dataframe with column names renamed 161 | 162 | Exceptions: 163 | :exception Throws exception that's encountered during renaming column on dataframe 164 | """ 165 | _df = df 166 | try: 167 | # Trim and lowercase all column names 168 | for column_name in filter(lambda c: not column_names_to_rename.keys().__contains__(c), df.columns): 169 | _df = _df.withColumnRenamed(column_name, column_name.strip().lower()) 170 | 171 | for column_name, revised_column_name in column_names_to_rename.items(): 172 | _df = _df.withColumnRenamed(column_name, revised_column_name) 173 | return _df 174 | except Exception as ex: 175 | log_exception_details(message=f"Error renaming columns 
on given dataframe {column_names_to_rename}", 176 | exception_object=ex) 177 | raise ex 178 | -------------------------------------------------------------------------------- /src/com/vitthalmirji/utils/spark_submit_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import logging.config 3 | from datetime import datetime 4 | 5 | from com.vitthalmirji.utils.Utilities import get_dates_between_range 6 | 7 | START_TIME = datetime.now().isoformat().__str__() 8 | 9 | 10 | class StaticConfigParameterNotFound(Exception): 11 | pass 12 | 13 | 14 | def sort_spark_submit_options(command_options): 15 | sorted_command_options = sorted(command_options, key=lambda k: k[0]) 16 | return sorted_command_options 17 | 18 | 19 | def update_conf(command_options): 20 | _conf = {k: v for k, v in command_options.items() if k == '--conf'}['--conf']['value'] 21 | _conf = get_spark_conf_key_value_as_string(_conf) 22 | _conf = f"\"{_conf}\"" 23 | command_options['--conf'].update({'value': _conf}) 24 | 25 | return command_options 26 | 27 | 28 | def get_class_arguments_as_string(command): 29 | return ' \\\n'.join(list(map(lambda c: f"{c}={command['--class_arguments']['value'][c]}", 30 | command['--class_arguments']['value']))) 31 | 32 | 33 | def get_spark_conf_key_value_as_string(conf): 34 | return f"""{",".join([f"{d}={conf[d]}" for d in conf])}\"""" 35 | 36 | 37 | def static_config_args_sanity_check(command, config): 38 | for cmd in command: 39 | if config.get(cmd) is None and command[cmd]['required'] is True: 40 | logging.error(f"Configuration file do not have required spark-submit option {cmd}") 41 | raise StaticConfigParameterNotFound( 42 | f"ERROR: Configuration file do not have required spark-submit option {cmd}") 43 | elif config.get(cmd) is not None: 44 | command[cmd].update({'value': config.get(cmd)}) 45 | else: 46 | continue 47 | return command 48 | 49 | 50 | def update_spark_submit_option_values(runtime_args, config_args, command): 51 | config_args['default']['--conf'].update(config_args[runtime_args['workflow']]['spark_conf']) 52 | config_args['default']['--name'] = f"\"{runtime_args['workflow']}\"" 53 | command['--class_arguments']['value'].update(runtime_args) 54 | return config_args, command 55 | 56 | 57 | def prepare_spark_submit(runtime_args, config_args, app_config): 58 | command = app_config['spark_submit_options_order'] 59 | _config_args, command = update_spark_submit_option_values(runtime_args, config_args, command) 60 | _config_args = _config_args['default'] 61 | 62 | command = static_config_args_sanity_check(command, _config_args) 63 | 64 | command_date_ranges = get_dates_between_range(refresh_type=runtime_args['refreshType'], 65 | start_date=runtime_args['startDate'], 66 | end_date=runtime_args['endDate'], 67 | interval_in_days=app_config['default_settings'][ 68 | 'history_load_interval_in_days'], 69 | date_pattern='%Y-%m-%d') 70 | logging.debug(f"Date Range = {command_date_ranges}") 71 | command = update_conf(command_options=command) 72 | spark_submit_command = ' \\\n'.join(f"{k} {v['value']}" for k, v in command.items() if k != '--class_arguments') 73 | 74 | logging.debug(command) 75 | command_list = [] 76 | for d in command_date_ranges: 77 | command['--class_arguments']['value'].update(d) 78 | class_args = get_class_arguments_as_string(command) 79 | command_list.append(f"{spark_submit_command}\n{class_args}") 80 | return command_list 81 | -------------------------------------------------------------------------------- 
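The spark-submit helpers above assemble the final command from nested option dictionaries. Below is a minimal, hypothetical sketch (the option map and conf values are illustrative only, and it assumes the repository's src path is on PYTHONPATH) showing how update_conf flattens the '--conf' entry into the single quoted key=value,key=value string that later gets joined into the spark-submit command:

from com.vitthalmirji.utils.spark_submit_utils import update_conf

# Hypothetical option map; only the '--conf' entry is touched by update_conf().
command_options = {
    '--conf': {
        'required': True,
        'value': {
            'spark.executor.memory': '4g',
            'spark.sql.shuffle.partitions': '200',
        },
    },
}

updated = update_conf(command_options=command_options)
# The conf dict has been collapsed into one quoted "key=value,key=value" string,
# ready to be placed after --conf when prepare_spark_submit() assembles the command.
print(updated['--conf']['value'])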
/src/com/vitthalmirji/utils/transformation_extension.py: -------------------------------------------------------------------------------- 1 | from pyspark.rdd import RDD 2 | from pyspark.sql.dataframe import DataFrame 3 | 4 | 5 | def transform(self, f): 6 | return f(self) 7 | 8 | 9 | RDD.transform = transform 10 | DataFrame.transform = transform 11 | -------------------------------------------------------------------------------- /tests/EtlTransformTest.py: -------------------------------------------------------------------------------- 1 | import time 2 | import unittest 3 | 4 | from etl import Transform 5 | from etl.meta import MetaModel 6 | from utils.Utilities import SparkSettings 7 | 8 | start_time = time.time() 9 | 10 | 11 | class MyTestCase(unittest.TestCase): 12 | def testEtlTransformations(self): 13 | self.spark = SparkSettings("EtlTransformTest").getSparkSession() 14 | metamodel = MetaModel(datamodelpath='resources/datamodel.csv', sc=self.spark) 15 | 16 | # print(f"Data model as JSON -> \n{metamodel.datamodel}") 17 | 18 | metamodel.readMetadataFromCsv(sc=self.spark, metadatapath='resources/meta.csv', targettable='invoice') 19 | metamodel.readSourceFilesIntoDF() 20 | 21 | targetddl = metamodel.getTargetDdl('PARQUET', True) 22 | # print('------Target DDL ------') 23 | # print('------Target Query ------') 24 | # print(f"{queryhead} {querytail}") 25 | 26 | # self.spark.sql(f"{queryhead} {querytail}").show() 27 | 28 | trans = Transform(targettable='invoice', model=metamodel, sc=self.spark) 29 | 30 | trans.transform() 31 | self.assertIsNotNone(trans) 32 | 33 | 34 | if __name__ == '__main__': 35 | unittest.main() 36 | -------------------------------------------------------------------------------- /tests/UtilsTest.py: -------------------------------------------------------------------------------- 1 | import getpass 2 | import unittest 3 | 4 | from utils.Utilities import create_spark_session, count_words, split_words 5 | 6 | 7 | class UtilsTest(unittest.TestCase): 8 | 9 | def test1_testSparkSettings(self): 10 | print("Testing Spark Settings") 11 | self.spark = create_spark_session(application_name="Utils Test") 12 | self.assertEqual(str(self.spark.version), "2.4.5") 13 | self.assertEqual(str(self.spark.sparkContext.sparkUser()), getpass.getuser()) 14 | 15 | metadf: DataFrame = self.spark.read.option("header", "true").format("csv").load(path="resources/meta.csv") 16 | 17 | self.assertEqual(True, True) 18 | 19 | def test2_test_custom_transformations(self): 20 | print("Testing Environment") 21 | self.spark = create_spark_session(application_name="Utils Test") 22 | line_array = ["Hello,World,How,are,you", "Hello.World.How.are.you", "Hello;World;How;are;you", 23 | "Hello-World-How-are-you", "Hello|World|How|are|you", "Hello World How are you"] 24 | 25 | lines_rdd: RDD[str] = self.spark.sparkContext.parallelize(line_array) 26 | df = lines_rdd.transform(lambda _rdd: split_words(_rdd)).transform(lambda _rdd: count_words(_rdd)) 27 | df.toDF().toDF("Word", "Count").show() 28 | 29 | self.assertTrue(df is not None) 30 | self.assertEqual(df.count(), 5) 31 | 32 | 33 | if __name__ == '__main__': 34 | unittest.main() 35 | -------------------------------------------------------------------------------- /tests/XmlMapperTest.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from com.vitthalmirji.imports.HdfsImport import HdfsImport 4 | from com.vitthalmirji.mapper.Mapper import ComplexDataMapper 5 | 6 | 7 | class 
XmlMapperTest(unittest.TestCase): 8 | def test_create_hive_ql_for_nested_data_explode(self): 9 | print("Testing HdfsImport readFromSource") 10 | self.sparksettings = SparkSettings("XmlMapperTest") 11 | self.spark = self.sparksettings.getSparkSession() 12 | self.hdfsImport = HdfsImport(self.spark) 13 | 14 | # Read JSON file from given path 15 | json_df = self.spark.read.json(path='resources/clinical_trial/*.xml') 16 | 17 | json_df.printSchema() 18 | 19 | # Register as temporary view / table for flattening queries to execute on 20 | json_df.createOrReplaceTempView('jsontable') 21 | 22 | # self.spark.range(10).select(monotonically_increasing_id()).show() 23 | # self.spark.range(10).select(monotonically_increasing_id()).coalesce(1).show() 24 | # self.spark.range(10).repartition(5).select(monotonically_increasing_id()).coalesce(1).show() 25 | 26 | # Create an object of class XmlMapper from Mapper.py by passing spark variable 27 | xml_mapper: ComplexDataMapper = ComplexDataMapper(sc=self.spark) 28 | 29 | # Call createViews function by passing json_df dataframe, it returns 2 things flattening queries and XPATH ( 30 | # Only for XML; Ignore for JSON) 31 | view_queries = xml_mapper.createViews(df=json_df, root_table_name='jsontable', 32 | columns_cascade_to_leaf_level_with_alias=[ 33 | 'item.organizationId AS pk_organizationId']) 34 | 35 | # Loop through all queries, execute them, physicalize flattened attributes as table - Repeat steps to all 36 | # queries (Nested attributes) 37 | for q in view_queries[0]: 38 | print(f'{q}:' f'{view_queries[0][q]}') 39 | temp_df = self.spark.sql(view_queries[0][q]) 40 | temp_df.rdd.zipWithUniqueId().toDF().printSchema() 41 | temp_df.createOrReplaceTempView(q) 42 | select_cols = [] 43 | for col in temp_df.schema.fields: 44 | if not str(col.dataType).lower().startswith("struct") and not str(col.dataType).lower().startswith( 45 | "array"): 46 | select_cols.append(col.name) 47 | print(f"Total partitions = {temp_df.rdd.getNumPartitions()}") 48 | temp_df.select(select_cols).show() 49 | 50 | 51 | if __name__ == '__main__': 52 | unittest.main() 53 | -------------------------------------------------------------------------------- /tests/aws_test/AwsS3Test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import subprocess 4 | import unittest 5 | 6 | import boto3 7 | from pyspark.sql.dataframe import DataFrame 8 | from pyspark.sql.session import SparkSession 9 | 10 | from utils.Utilities import list_s3_files 11 | 12 | 13 | class AwsS3Test(unittest.TestCase): 14 | @classmethod 15 | def setUpClass(cls) -> None: 16 | # create an s3 connection that points to the moto server. 17 | cls.s3_resource_obj = boto3.resource( 18 | "s3", 19 | endpoint_url="http://127.0.0.1:5000" 20 | ) 21 | 22 | cls.s3_client_obj = boto3.client( 23 | "s3", 24 | endpoint_url="http://127.0.0.1:5000" 25 | ) 26 | # start moto server, by default it runs on localhost on port 5000. 27 | cls.process = subprocess.Popen( 28 | ['moto_server', 's3'], 29 | stdout=subprocess.PIPE, 30 | shell=True, 31 | creationflags=subprocess.CREATE_NEW_PROCESS_GROUP 32 | ) 33 | 34 | # create an S3 bucket. 35 | cls.s3_resource_obj.create_bucket(Bucket="bucket") 36 | 37 | # # configure pyspark to use hadoop-aws module. 
os.environ[ "PYSPARK_SUBMIT_ARGS" ] = '--packages 38 | # "org.apache.hadoop:hadoop-aws:2.7.3" --packages "org.apache.httpcomponents:httpclient:4.2.5" ' \ 39 | # '--packages "org.xerial.snappy:snappy-java:1.1.7.3" pyspark-shell ' 40 | 41 | # get the spark session object and hadoop configuration. 42 | cls.spark: SparkSession = SparkSession.builder.getOrCreate() 43 | cls.hadoop_conf = cls.spark.sparkContext._jsc.hadoopConfiguration() 44 | # mock the aws credentials to access s3. 45 | cls.hadoop_conf.set("fs.s3a.access.key", "dummy-value") 46 | cls.hadoop_conf.set("fs.s3a.secret.key", "dummy-value") 47 | # we point s3a to our moto server. 48 | cls.hadoop_conf.set("fs.s3a.endpoint", "http://127.0.0.1:5000") 49 | # we need to configure hadoop to use s3a. 50 | cls.hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 51 | 52 | @classmethod 53 | def test_dataframe_operation_s3(cls): 54 | # create a pyspark dataframe. 55 | values = [("k1", 1), ("k2", 2)] 56 | columns = ["key", "value"] 57 | df = cls.spark.createDataFrame(values, columns) 58 | # write the dataframe as csv to s3. 59 | df.write.mode('overwrite').csv("s3://bucket/source.csv") 60 | # read the dataset from s3 61 | df = cls.spark.read.csv("s3://bucket/source.csv") 62 | # print Data 63 | df.show() 64 | # assert df is a DataFrame 65 | assert isinstance(df, DataFrame) 66 | 67 | print("test_dataframe_operation_s3 successfully completed") 68 | 69 | @classmethod 70 | def test_3_create_directory_files_s3(cls): 71 | some_binary_data = b'Here we have some data' 72 | 73 | cls.s3_client_obj.put_object(Bucket="bucket", Key=("dir1" + '/')) 74 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir1/dir1.txt') 75 | 76 | cls.s3_client_obj.put_object(Bucket="bucket", Key=("dir1/subdir1" + '/')) 77 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir1/subdir1/dir1_subdir1.txt') 78 | 79 | cls.s3_client_obj.put_object(Bucket="bucket", Key=("dir1/subdir2" + '/')) 80 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir1/subdir2/dir1_subdir2.txt') 81 | 82 | cls.s3_client_obj.put_object(Bucket="bucket", Key="dir2/") 83 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir2/dir2.txt') 84 | 85 | cls.s3_client_obj.put_object(Bucket="bucket", Key="dir2/subdir1/") 86 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir2/subdir1/dir2_subdir1.txt') 87 | 88 | cls.s3_client_obj.put_object(Bucket="bucket", Key="dir2/subdir2/") 89 | cls.s3_client_obj.put_object(Body=some_binary_data, Bucket='bucket', Key='dir2/subdir2/dir2_subdir2.txt') 90 | 91 | contents = list_s3_files(opt={'Bucket': 'bucket'}) 92 | print(contents) 93 | contents = list_s3_files(opt={'Bucket': 'bucket'}, files_only=True) 94 | print(contents) 95 | contents = list_s3_files(opt={'Bucket': 'bucket'}, files_only=True, 96 | file_extension='.csv') 97 | print(contents) 98 | contents = list_s3_files(opt={'Bucket': 'bucket'}, files_only=True, 99 | file_extension='.xml') 100 | print(contents) 101 | 102 | @classmethod 103 | def tearDownClass(cls) -> None: 104 | # shut down the moto server.
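# NOTE: subprocess.CREATE_NEW_PROCESS_GROUP (used in setUpClass above) only exists on Windows,
# so this start/stop pattern is Windows-specific; TestGlueJob in test_glue_job.py terminates the
# whole process group with os.killpg instead, and a portable in-process alternative is sketched
# at the end of this listing.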
105 | os.kill(cls.process.pid, signal.SIGTERM) 106 | 107 | 108 | if __name__ == '__main__': 109 | unittest.main() 110 | -------------------------------------------------------------------------------- /tests/aws_test/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/vim89/datapipelines-essentials-python/1bcae0d7b6c56e49dcc002ad261297819d6d1b66/tests/aws_test/__init__.py -------------------------------------------------------------------------------- /tests/aws_test/glue_job.py: -------------------------------------------------------------------------------- 1 | import sys 2 | 3 | from pyspark.context import SparkContext 4 | 5 | 6 | # https://github.com/aws-samples/aws-glue-samples/tree/master/examples 7 | 8 | def run(cli_args, spark): 9 | # init glue pyspark job 10 | glue_args = _get_glue_args(cli_args=cli_args) 11 | spark_session, job = _get_spark_session_and_glue_job(glue_args) 12 | 13 | # run glue job code 14 | source = cli_args["source"] 15 | destination = cli_args["destination"] 16 | df = spark.read.csv(source) 17 | df.write.csv(destination) 18 | 19 | # commit job 20 | _commit_job(job) 21 | 22 | 23 | def _get_spark_session_and_glue_job(glue_args): 24 | from awsglue.context import GlueContext 25 | from awsglue.job import Job 26 | 27 | sc = SparkContext.getOrCreate() 28 | glue_context = GlueContext(sparkContext=sc) 29 | job = Job(glue_context=glue_context) 30 | job.init(glue_args["JOB_NAME"], glue_args) 31 | return glue_context.spark_session, job 32 | 33 | 34 | def _commit_job(job): 35 | job.commit() 36 | 37 | 38 | def _get_glue_args(cli_args): 39 | from awsglue.utils import getResolvedOptions 40 | glue_args = getResolvedOptions(args=cli_args, options=["JOB_NAME", "source", "destination"]) 41 | print(glue_args) 42 | return glue_args 43 | 44 | 45 | if __name__ == "__main__": 46 | run(["source", "destination"]) 47 | -------------------------------------------------------------------------------- /tests/aws_test/test_glue_job.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import subprocess 4 | import unittest 5 | from unittest import mock 6 | 7 | import boto3 8 | from pyspark.sql import SparkSession 9 | 10 | from aws_test import glue_job 11 | from utils.Utilities import delete_s3_bucket 12 | 13 | 14 | class TestGlueJob(unittest.TestCase): 15 | """ 16 | This test class setup a test environment to test our glue job, 17 | runs the glue job and checks the result. 18 | """ 19 | 20 | @classmethod 21 | def setUpClass(cls): 22 | """ 23 | the setup class starts a moto server, creates an S3 bucket, 24 | configures PySpark and Spark and dumps the source dataframe to S3. 
25 | """ 26 | S3_MOCK_ENDPOINT = "http://127.0.0.1:5000" 27 | 28 | # setup moto server 29 | # cls.process = subprocess.Popen( 30 | # "moto_server s3", stdout=subprocess.PIPE, 31 | # shell=True, preexec_fn=os.setsid() 32 | # ) 33 | 34 | os.environ['AWS_ACCESS_KEY_ID'] = 'test' 35 | os.environ['AWS_SECRET_ACCESS_KEY'] = 'test' 36 | 37 | cls.process = subprocess.Popen( 38 | ['moto_server', 's3'], 39 | stdout=subprocess.PIPE, 40 | shell=True, 41 | creationflags=subprocess.CREATE_NEW_PROCESS_GROUP 42 | ) 43 | 44 | # create s3 connection, bucket and s3 url's 45 | cls.s3_conn = boto3.resource( 46 | "s3", region_name="eu-central-1", 47 | endpoint_url=S3_MOCK_ENDPOINT 48 | ) 49 | bucket = "bucket" 50 | delete_s3_bucket(bucket) 51 | cls.s3_conn.create_bucket(Bucket=bucket) 52 | cls.s3_source = "s3://{}/{}".format(bucket, "source.csv") 53 | cls.s3_destination = "s3://{}/{}".format(bucket, "destination.csv") 54 | 55 | # Setup spark to use s3, and point it to the moto server. 56 | os.environ[ 57 | "PYSPARK_SUBMIT_ARGS" 58 | ] = """--packages "org.apache.hadoop:hadoop-aws:2.7.4" pyspark-shell""" 59 | cls.spark = SparkSession.builder.getOrCreate() 60 | hadoop_conf = cls.spark.sparkContext._jsc.hadoopConfiguration() 61 | hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 62 | hadoop_conf.set("fs.s3a.access.key", "mock") 63 | hadoop_conf.set("fs.s3a.secret.key", "mock") 64 | hadoop_conf.set("fs.s3a.endpoint", S3_MOCK_ENDPOINT) 65 | 66 | # create source dataframe and write the dataframe as csv to s3 67 | values = [("k1", 1), ("k2", 2)] 68 | columns = ["key", "value"] 69 | df = cls.spark.createDataFrame(values, columns) 70 | df.write.csv(cls.s3_source) 71 | 72 | @mock.patch("glue_job._commit_job") 73 | @mock.patch("glue_job._get_glue_args") 74 | @mock.patch("glue_job._get_spark_session_and_glue_job") 75 | def test_glue_job_runs_successfully(self, m_session_job, m_get_glue_args, m_commit): 76 | """ 77 | we arrange our test function; construct the arguments that we get from the cli, set the return 78 | values of our mocked functions. 79 | we run our glue job and assert if the result is what we expect. 
80 | """ 81 | # arrange 82 | cli_args = {"--JOBNAME": 'TestGlueLocal', "--source": self.s3_source, "--destination": self.s3_destination} 83 | 84 | m_session_job.return_value = self.spark, None 85 | m_get_glue_args.return_value = cli_args 86 | 87 | # act 88 | glue_job.run(cli_args=cli_args, spark=self.spark) 89 | 90 | # assert 91 | df = self.spark.read.csv(self.s3_destination) 92 | self.assertTrue(not df.rdd.isEmpty()) 93 | 94 | @classmethod 95 | def tearDownClass(cls): 96 | # shut down moto server 97 | os.killpg(os.getpgid(cls.process.pid), signal.SIGTERM) 98 | 99 | 100 | if __name__ == "__main__": 101 | try: 102 | unittest.main() 103 | except Exception: 104 | TestGlueJob().tearDownClass() 105 | -------------------------------------------------------------------------------- /tests/aws_test/test_mocked_postgres.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import sqlalchemy 4 | from testcontainers.postgres import PostgresContainer 5 | 6 | 7 | class MockedPostgresTest(unittest.TestCase): 8 | @classmethod 9 | def test_docker_run_postgress(cls) -> None: 10 | postgres_container = PostgresContainer("postgres:9.5") 11 | with postgres_container as postgres: 12 | e = sqlalchemy.create_engine(postgres.get_connection_url()) 13 | result = e.execute("SELECT version()") 14 | 15 | @classmethod 16 | def tearDownClass(cls) -> None: 17 | print('Done') 18 | 19 | 20 | if __name__ == '__main__': 21 | try: 22 | unittest.main() 23 | except Exception: 24 | MockedPostgresTest().tearDownClass() 25 | -------------------------------------------------------------------------------- /tests/aws_test/testing_mocked_s3.py: -------------------------------------------------------------------------------- 1 | import os 2 | import signal 3 | import subprocess 4 | import unittest 5 | 6 | import boto3 7 | from pyspark.sql import DataFrame 8 | from pyspark.sql import SparkSession 9 | 10 | 11 | class MockTestGlueJob(unittest.TestCase): 12 | # start moto server, by default it runs on localhost on port 5000. 13 | process = subprocess.Popen( 14 | ['moto_server', 's3'], 15 | stdout=subprocess.PIPE, 16 | shell=True, 17 | creationflags=subprocess.CREATE_NEW_PROCESS_GROUP 18 | ) 19 | 20 | @classmethod 21 | def setUpClass(cls) -> None: 22 | # create an s3 connection that points to the moto server. 23 | s3_conn = boto3.resource( 24 | "s3", endpoint_url="http://127.0.0.1:5000" 25 | ) 26 | # create an S3 bucket. 27 | s3_conn.create_bucket(Bucket="bucket") 28 | # # configure pyspark to use hadoop-aws module. os.environ[ "PYSPARK_SUBMIT_ARGS" ] = '--packages 29 | # "org.apache.hadoop:hadoop-aws:2.7.3" --packages "org.apache.httpcomponents:httpclient:4.2.5" ' \ 30 | # '--packages "org.xerial.snappy:snappy-java:1.1.7.3" pyspark-shell ' 31 | 32 | def test_s3_glue_jobs_locally(self): 33 | # get the spark session object and hadoop configuration. 34 | spark = SparkSession.builder.getOrCreate() 35 | hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration() 36 | # mock the aws credentials to access s3. 37 | hadoop_conf.set("fs.s3a.access.key", "dummy-value") 38 | hadoop_conf.set("fs.s3a.secret.key", "dummy-value") 39 | # we point s3a to our moto server. 40 | hadoop_conf.set("fs.s3a.endpoint", "http://127.0.0.1:5000") 41 | # we need to configure hadoop to use s3a. 42 | hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") 43 | # create a pyspark dataframe. 
44 | values = [("k1", 1), ("k2", 2)] 45 | columns = ["key", "value"] 46 | df = spark.createDataFrame(values, columns) 47 | # write the dataframe as csv to s3. 48 | df.write.mode('overwrite').csv("s3://bucket/source.csv") 49 | # read the dataset from s3 50 | df = spark.read.csv("s3://bucket/source.csv") 51 | # print Data 52 | df.show() 53 | # assert df is a DataFrame 54 | assert isinstance(df, DataFrame) 55 | print("test_s3_glue_jobs_locally successfully completed") 56 | 57 | @classmethod 58 | def tearDownClass(cls) -> None: 59 | # shut down the moto server. 60 | os.kill(cls.process.pid, signal.SIGTERM) 61 | 62 | 63 | if __name__ == "__main__": 64 | try: 65 | unittest.main() 66 | except Exception: 67 | MockTestGlueJob().tearDownClass() 68 | -------------------------------------------------------------------------------- /tests/resources/config.yml: -------------------------------------------------------------------------------- 1 | # required to connect to redshift 2 | host: my.redshift.cluster.com 3 | port: 5439 4 | database: db 5 | user: userid 6 | password: password 7 | ## optional extras for the dbapi connector 8 | sslmode: require 9 | another_option: 123 -------------------------------------------------------------------------------- /tests/resources/datamodel.csv: -------------------------------------------------------------------------------- 1 | table ,pk ,fk_table,fk_col ,fk_table_jointype 2 | xmltable ,_id , , , 3 | carbank_xmltable,_secid ,xmltable,_id , 4 | species_xmltable,col ,xmltable,_id , 5 | red_xmltable ,MedlineID,xmltable,_id , 6 | product ,id ,purchase,productid,LEFT 7 | purchase ,id , , , 8 | store ,id ,purchase,storeid ,INNER 9 | -------------------------------------------------------------------------------- /tests/resources/meta.csv: -------------------------------------------------------------------------------- 1 | key ,src_system,source_desc,src_database,src_table ,src_filetype,src_file_path ,src_table_description,src_col ,src_col_description,src_col_datatype,key_constraints,src_col_filter,src_col_aggregator,src_col_aggregator_filter,check_column,mode,udf ,udfarguments,target_database,src_table_order,target_col ,target_col_filter,target_col_aggregator,target_col_aggregator_filter,target_table ,target_file_path,target_col_datatype,access_limitation,nullable,comment 2 | carbank_xmltable-_sec_id , , , ,carbank_xmltable,tbl , , ,_sec_id , ,string , , , , , , , , , ,0 ,sec_id , , , ,transformxmltable, ,string , , , 3 | xmltable-_id , , , ,xmltable ,xml , , ,_id , ,string , , , , , , , , , ,0 ,id , , , ,transformxmltable, ,string , , , 4 | xmltable-_mtype , , , ,xmltable ,xml , , ,_mtype , ,string , , , , , , , , , ,0 ,mtype , , , ,transformxmltable, ,string , , , 5 | xmltable-_seqlen , , , ,xmltable ,xml , , ,_seqlen , ,bigint , , , , , , , , , ,0 ,seqlen , , , ,transformxmltable, ,bigint , , , 6 | species_xmltable-col , , , ,species_xmltable,tbl , , ,col , ,string , , , , , , , , , ,0 ,species , , , ,transformxmltable, ,string , , , 7 | red_xmltable-MedlineID , , , ,red_xmltable ,tbl , , ,MedlineID , ,bigint , , , , , , , , , ,0 ,MedlineID , , , ,transformxmltable, ,bigint , , , 8 | product-name , , , ,product ,csv ,resources/product.csv , ,name , ,string , , , , , , ,nvl ,- , ,0 ,name , , , ,invoice , ,string , , , 9 | purchase-purchasedate , , , ,purchase ,csv ,resources/purchase.csv, ,purchasedate , ,string ,pk , , , , , ,nvl ,1/1/1900 , ,0 ,purchasedate , , , ,invoice , ,string , , , 10 | store-name , , , ,store ,csv ,resources/store.csv , ,name , ,string , 
,eq('Dadar') , , , , ,nvl ,- , ,0 ,storename , , , ,invoice , ,string , , , 11 | product-name , , , ,product ,csv ,resources/product.csv , ,name , ,string , , , , , , ,nvl ,- , ,1 ,name , , , ,salesummary , ,string , , , 12 | purchase-id , , , ,purchase ,csv ,resources/purchase.csv, ,id , ,string , , , , , , ,count, , ,1 ,totalsolditems , , , ,salesummary , ,string , , , 13 | store-name , , , ,store ,csv ,resources/store.csv , ,name , ,string , , , , , , ,nvl ,- , ,1 ,storename , , , ,salesummary , ,string , , , 14 | salesummary-storename , , , ,salesummary , , , ,storename , , , , , , , , , , , ,0 ,productname , , , ,salesummary , ,string , , , 15 | salesummary-name , , , ,salesummary , , , ,name , , , , , , , , , , , ,0 ,totalsoldproducts, , , ,salesummary , ,string , , , 16 | salesummary-totalsolditems, , , ,salesummary , , , ,totalsolditems, , , , , , , , , , , ,0 ,storename , , , ,salesummary , ,string , , , 17 | -------------------------------------------------------------------------------- /tests/resources/mock_dataframe.txt: -------------------------------------------------------------------------------- 1 | a,b,c 2 | 1,x,2011-01-01 3 | 2,y,2001-04-02 -------------------------------------------------------------------------------- /tests/resources/product.csv: -------------------------------------------------------------------------------- 1 | id,name ,price 2 | 1 ,Wrist Watch,10 3 | 2 ,Shoes ,8 4 | 3 ,Tshirt ,5 5 | 4 ,Jeans ,7 6 | 5 ,Sunglasses ,7 7 | -------------------------------------------------------------------------------- /tests/resources/purchase.csv: -------------------------------------------------------------------------------- 1 | id ,productid,purchasedate,storeid 2 | 100,1 ,10/11/2019 ,1000 3 | 101,3 ,10/12/2019 ,1002 4 | 102,1 , ,1004 5 | 103,1 ,10/14/2019 ,1004 6 | 104,4 ,10/15/2019 ,1003 7 | 105,4 ,10/16/2019 ,1002 8 | -------------------------------------------------------------------------------- /tests/resources/store.csv: -------------------------------------------------------------------------------- 1 | id ,name 2 | 1000,Borivili 3 | 1001,Kandivili 4 | 1002,Andheri 5 | 1003,Bandra 6 | 1004,Dadar 7 | 1005,Byculla 8 | -------------------------------------------------------------------------------- /tests/test_comprehensive_logging.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import unittest 3 | import logging.config 4 | 5 | from com.hellofresh.utils.comprehensive_logging import init_logging 6 | 7 | 8 | class LoggingTestCases(unittest.TestCase): 9 | def test_init_logging(self): 10 | init_logging(job_name='Unit tests') 11 | logger = logging.getLogger('root') 12 | self.assertEqual(logger.level, 10) 13 | 14 | 15 | if __name__ == '__main__': 16 | unittest.main() 17 | -------------------------------------------------------------------------------- /tests/test_data_quality.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import unittest 3 | from pathlib import Path 4 | 5 | from com.hellofresh.utils.data_quality import Rule, RuleExecutionResult, DataQuality 6 | from com.hellofresh.utils.helpers import get_project_root, read_json_get_dict 7 | from com.hellofresh.utils.spark import get_or_create_spark_session 8 | 9 | 10 | class DataQualityTestCases(unittest.TestCase): 11 | def test_Rule(self): 12 | rule_dict = { 13 | "rule_id": 1011, 14 | "name": "Primary / Natural Keys", 15 | "description": "Primary / Natural Keys should not have duplicates", 16 | 
"rule_type": "unique", 17 | "columns": [ 18 | "name" 19 | ] 20 | } 21 | rule = Rule(**rule_dict) 22 | self.assertEqual(rule.rule_id, 1011) 23 | self.assertEqual(rule.name, "Primary / Natural Keys") 24 | 25 | def test_RuleExecutionResult(self): 26 | rule_dict = { 27 | "rule_id": 1011, 28 | "name": "Primary / Natural Keys", 29 | "description": "Primary / Natural Keys should not have duplicates", 30 | "rule_type": "unique", 31 | "columns": [ 32 | "name" 33 | ] 34 | } 35 | rule = Rule(**rule_dict) 36 | result = RuleExecutionResult(rule, 'fail', 0, 0, 0) 37 | self.assertEqual(result.status, 'fail') 38 | self.assertEqual(result.rule, rule) 39 | self.assertEqual(result.rule.rule_type, 'unique') 40 | 41 | def test_data_quality(self): 42 | shutil.rmtree(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks") 43 | t1_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task1-dq-rules.json" 44 | dq = read_json_get_dict(json_path=t1_dq) 45 | dq['execution_reports_dir'] = f"{get_project_root()}/resources/data-quality-reports/recipe-tasks" 46 | dq_rules = DataQuality(**dq) 47 | spark = get_or_create_spark_session() 48 | df = spark.read.option('encoding', 'utf-8').json(f"{get_project_root()}/resources/data/input") 49 | execution_result = dq_rules.execute_rules(df=df) 50 | self.assertEqual(execution_result[0], False) 51 | self.assertTrue(execution_result[1].__contains__('')) 52 | dq_rules.write_report_to_html(file_name="task1-dq-report.html") 53 | self.assertTrue( 54 | Path(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks/task1-dq-report.html").is_file()) 55 | 56 | 57 | if __name__ == '__main__': 58 | unittest.main() 59 | -------------------------------------------------------------------------------- /tests/test_helpers.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import getpass 3 | import unittest 4 | from pathlib import Path 5 | 6 | from isodate import ISO8601Error 7 | 8 | from com.hellofresh.datapipelines.recipe_tasks import determine_cooking_difficulty 9 | from com.hellofresh.utils.comprehensive_logging import init_logging 10 | from com.hellofresh.utils.helpers import get_user, get_project_root, convert_iso_to_time_duration, \ 11 | add_iso_time_duration 12 | 13 | 14 | class UtilsHelpersTestCases(unittest.TestCase): 15 | init_logging(job_name='UtilsHelpersTestCases') 16 | 17 | def test_get_user(self): 18 | user = get_user() 19 | self.assertEqual(user, getpass.getuser()) 20 | 21 | def test_get_project_root(self): 22 | project_root_path: Path = get_project_root() 23 | self.assertEqual(project_root_path.name, 'vim89-data-engineering-test') 24 | 25 | def test_convert_iso_to_time_duration(self): 26 | try: 27 | convert_iso_to_time_duration("") 28 | except ValueError as v: 29 | self.assertEqual(v.__str__(), 'Empty or Invalid time duration string') 30 | 31 | try: 32 | convert_iso_to_time_duration("ABC") 33 | except ISO8601Error as i: 34 | self.assertEqual(i.__str__(), 'Error converting ISO time ABC to timedelta') 35 | 36 | iso_time = convert_iso_to_time_duration("PT100M") 37 | self.assertEqual(iso_time, datetime.timedelta(hours=1, minutes=40)) 38 | 39 | iso_time = convert_iso_to_time_duration("PT") 40 | self.assertEqual(iso_time, datetime.timedelta(0)) 41 | 42 | def test_add_iso_time_duration(self): 43 | try: 44 | add_iso_time_duration(time1="", time2="PT1H") 45 | except ValueError as v: 46 | self.assertEqual(v.__str__(), 'Empty or Invalid time duration string') 47 | 48 | iso_time = 
add_iso_time_duration(time1="PT100M", time2="PT1H") 49 | self.assertEqual(iso_time, "PT2H40M") 50 | 51 | iso_time = add_iso_time_duration(time1="PT", time2="PT5M") 52 | self.assertEqual(iso_time, "PT5M") 53 | 54 | iso_time = add_iso_time_duration(time1="PT", time2="PT") 55 | self.assertEqual(iso_time, "P0D") 56 | 57 | def test_determine_difficulty(self): 58 | difficulty = determine_cooking_difficulty(cook_time="PT", prep_time="PT") 59 | self.assertEqual(difficulty, ('P0D', 'easy')) 60 | 61 | difficulty = determine_cooking_difficulty(cook_time="PT21H", prep_time="PT") 62 | self.assertEqual(difficulty, ('PT21H', 'hard')) 63 | 64 | difficulty = determine_cooking_difficulty(cook_time="PT", prep_time="PT100M") 65 | self.assertEqual(difficulty, ('PT1H40M', 'hard')) 66 | 67 | 68 | if __name__ == '__main__': 69 | unittest.main() 70 | -------------------------------------------------------------------------------- /tests/test_logging_util.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import unittest 3 | import logging.config 4 | 5 | from utils.Utilities import init_logging 6 | from utils.audit_util import audit_action 7 | 8 | 9 | class TestLoggingUtil(unittest.TestCase): 10 | def test_init_logging(self): 11 | init_logging(log_time_stamp=datetime.datetime.now()) 12 | level20 = logging.getLogger('simpleExample').level 13 | self.assertEqual(level20, 20) 14 | 15 | def test_audit_action(self): 16 | @audit_action(action=f"testing Audit Action Wrapper") 17 | def audit_decorator(): 18 | pass 19 | 20 | 21 | if __name__ == '__main__': 22 | unittest.main() 23 | -------------------------------------------------------------------------------- /tests/test_recipe_tasks.py: -------------------------------------------------------------------------------- 1 | import shutil 2 | import unittest 3 | from pathlib import Path 4 | 5 | from com.hellofresh.datapipelines.recipe_tasks import main, task1, task2, determine_cooking_difficulty, \ 6 | calculate_time_duration_average, standardize_and_rename_df_columns 7 | from com.hellofresh.utils.data_quality import DataQuality 8 | from com.hellofresh.utils.helpers import get_project_root, read_json_get_dict, convert_iso_to_time_duration 9 | from com.hellofresh.utils.spark import get_or_create_spark_session 10 | 11 | 12 | def del_dirs(): 13 | try: 14 | shutil.rmtree(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks") 15 | shutil.rmtree(f"{get_project_root()}/resources/data/output/task1") 16 | shutil.rmtree(f"{get_project_root()}/resources/data/output/task2") 17 | except: 18 | pass 19 | 20 | 21 | class RecipeTasksTestCases(unittest.TestCase): 22 | 23 | @classmethod 24 | def setUpClass(self): 25 | self.args = { 26 | 'input_data_dir': f"{get_project_root()}/resources/data/input", 27 | 'output_data_dir': f"{get_project_root()}/resources/data/output" 28 | } 29 | del_dirs() 30 | 31 | @unittest.skip 32 | def test_main(self): 33 | del_dirs() 34 | t1_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task1-dq-rules.json" 35 | t2_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task2-dq-rules.json" 36 | main(self.args, t1_dq, t2_dq) 37 | self.assertTrue( 38 | Path(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks/task1-dq-report.html").is_file()) 39 | 40 | def test_task1(self): 41 | del_dirs() 42 | t1_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task1-dq-rules.json" 43 | dq = read_json_get_dict(json_path=t1_dq) 
44 | dq['execution_reports_dir'] = f"{get_project_root()}/resources/data-quality-reports/recipe-tasks" 45 | dq_rules = DataQuality(**dq) 46 | task1(input_data_path=self.args['input_data_dir'], input_file_type='json', dq_rules=dq_rules, 47 | output_data_path=f"{self.args['output_data_dir']}/task1", spark_opts={'encoding': 'utf-8'}) 48 | 49 | self.spark = get_or_create_spark_session() 50 | df = self.spark.read.parquet(f"{self.args['output_data_dir']}/task1") 51 | self.assertEqual(df.count(), 1042) 52 | self.assertTrue(df.columns.__contains__('cook_time')) 53 | self.assertTrue( 54 | Path(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks/task1-dq-report.html").is_file()) 55 | 56 | def test_task2(self): 57 | t2_dq = f"{get_project_root()}/conf/data-quality/rules/unit_test_configs/recipe-task2-dq-rules.json" 58 | dq = read_json_get_dict(json_path=t2_dq) 59 | dq['execution_reports_dir'] = f"{get_project_root()}/resources/data-quality-reports/recipe-tasks" 60 | dq_rules = DataQuality(**dq) 61 | task2(input_data_path=f"{self.args['output_data_dir']}/task1", input_file_type='parquet', dq_rules=dq_rules, 62 | output_data_path=f"{self.args['output_data_dir']}/task2") 63 | 64 | self.spark = get_or_create_spark_session() 65 | df = self.spark.read.csv(f"{self.args['output_data_dir']}/task2", header=True) 66 | self.assertEqual(df.count(), 3) 67 | self.assertTrue(df.columns.__contains__('avg_total_cooking_time')) 68 | self.assertTrue( 69 | Path(f"{get_project_root()}/resources/data-quality-reports/recipe-tasks/task2-dq-report.html").is_file()) 70 | 71 | def test_determine_cooking_difficulty(self): 72 | difficulty = determine_cooking_difficulty("PT1H", "PT2M") 73 | self.assertEqual(difficulty, ('PT1H2M', 'hard')) 74 | difficulty = determine_cooking_difficulty("PT5M", "PT15M") 75 | self.assertEqual(difficulty, ('PT20M', 'easy')) 76 | difficulty = determine_cooking_difficulty("PT15M", "PT20M") 77 | self.assertEqual(difficulty, ('PT35M', 'medium')) 78 | difficulty = determine_cooking_difficulty("PT", "PT") 79 | self.assertEqual(difficulty, ('P0D', 'easy')) 80 | 81 | try: 82 | difficulty = determine_cooking_difficulty("", "PT1H") 83 | print(difficulty) 84 | except Exception as ex: 85 | self.assertEqual(ex.__str__(), 'Expecting a string None') 86 | 87 | def test_calculate_time_duration_average(self): 88 | list_of_time_duration = list(map(lambda t: convert_iso_to_time_duration(t), ["PT1H", "PT30M", "PT", "PT2H5M"])) 89 | avg = calculate_time_duration_average(list_of_time_duration) 90 | self.assertEqual(avg, 'PT53M45S') 91 | 92 | 93 | if __name__ == '__main__': 94 | unittest.main() 95 | -------------------------------------------------------------------------------- /tests/test_spark.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | from pyspark.sql import SparkSession 4 | 5 | from com.hellofresh.utils.comprehensive_logging import init_logging 6 | from com.hellofresh.utils.spark import get_or_create_spark_session, standardize_and_rename_df_columns, \ 7 | read_data_as_spark_dataframe, data_frame_repartition 8 | 9 | 10 | class UtilsSparkTestCases(unittest.TestCase): 11 | init_logging(job_name='UtilsSparkTestCases') 12 | 13 | def test_create_spark_session(self): 14 | spark: SparkSession = get_or_create_spark_session() 15 | self.assertIsNot(spark, None) 16 | self.assertEqual(spark, SparkSession.getActiveSession()) 17 | self.assertEqual(spark.sparkContext.appName.__str__(), 'pyspark-shell') 18 | 19 | def 
test_standardize_and_rename_df_columns(self): 20 | spark = get_or_create_spark_session() 21 | data = [('Category A', 100, "This is category A"), 22 | ('Category B', 120, "This is category B"), 23 | ('Category C', 150, "This is category C")] 24 | df = spark.sparkContext.parallelize(data).toDF(['cateGory ', ' iD ', 'category description']) 25 | 26 | self.assertEqual(df.columns, ['cateGory ', ' iD ', 'category description']) 27 | 28 | df = standardize_and_rename_df_columns(df=df, 29 | column_names_to_rename={'category description': 'category_description'}) 30 | self.assertEqual(df.columns, ['category', 'id', 'category_description']) 31 | 32 | def test_negative_cases_for_read_data_as_spark_dataframe(self): 33 | 34 | # INVALID 35 | try: 36 | df = read_data_as_spark_dataframe(filetype='invalid', location='a://a.txt') 37 | except Exception as ex: 38 | print(ex.__str__()) 39 | self.assertEqual(ex.__str__(), 'Invalid filetype: invalid') 40 | 41 | # CSV 42 | try: 43 | df = read_data_as_spark_dataframe(filetype='csv', location='a://a.csv') 44 | csv_read = 'successful' 45 | except Exception as ex: 46 | csv_read = 'failed' 47 | 48 | self.assertAlmostEqual(csv_read, 'failed') 49 | 50 | # TEXT 51 | try: 52 | df = read_data_as_spark_dataframe(filetype='text', location='a://a.txt') 53 | text_read = 'successful' 54 | except Exception as ex: 55 | text_read = 'failed' 56 | 57 | self.assertAlmostEqual(text_read, 'failed') 58 | 59 | # XML 60 | try: 61 | df = read_data_as_spark_dataframe(filetype='xml', location='a://a.xml') 62 | xml_read = 'successful' 63 | except Exception as ex: 64 | xml_read = 'failed' 65 | 66 | self.assertAlmostEqual(xml_read, 'failed') 67 | 68 | # Table 69 | try: 70 | df = read_data_as_spark_dataframe(filetype='table', location='a://a.xml') 71 | table_read = 'successful' 72 | except Exception as ex: 73 | table_read = 'failed' 74 | 75 | self.assertAlmostEqual(table_read, 'failed') 76 | 77 | def test_data_frame_repartition(self): 78 | spark = get_or_create_spark_session() 79 | data = [('Category A', 100, "This is category A"), 80 | ('Category B', 120, "This is category B"), 81 | ('Category C', 150, "This is category C")] 82 | df = spark.sparkContext.parallelize(data).toDF(['category', 'id', 'category_description']) 83 | 84 | df = data_frame_repartition(df=df, use_coalesce=True, num_files=1) 85 | self.assertTrue(df is not None) 86 | 87 | df = data_frame_repartition(df=df, num_files=5, repartition_columns=['category']) 88 | self.assertFalse(df.columns.__contains__('temp_repartition_column')) 89 | 90 | 91 | if __name__ == '__main__': 92 | unittest.main() 93 | -------------------------------------------------------------------------------- /tests/test_spark_submit_execution_pool.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | from datetime import datetime 3 | 4 | from utils.Utilities import init_logging, create_multiprocess_pool, execute_bash 5 | 6 | 7 | class TestSparkSubmitExecutionPool(unittest.TestCase): 8 | init_logging(datetime.now()) 9 | 10 | def test_create_multiprocess_pool(self): 11 | bash_commands = [ 12 | 'echo "cmd1"', 13 | 'echo "cmd2"', 14 | 'echo "cmd3"', 15 | 'hadoop version', 16 | 'echo "cmd5"', 17 | 'echo "cmd6"', 18 | 'echo "cmd7"', 19 | 'echo "cmd8"', 20 | 'echo "cmd9"', 21 | 'echo "cmd10"', 22 | 'echo "cmd11"' 23 | ] 24 | results, failures = create_multiprocess_pool( 25 | shared_data={'log_timestamp': datetime.now().isoformat().__str__()}, 26 | command_list=bash_commands, 27 | sleep_time=0, 28 | 
max_parallel_jobs=6 29 | ) 30 | 31 | bash_commands.append('spark-submit') 32 | _results, _failures = create_multiprocess_pool( 33 | shared_data={'log_timestamp': datetime.now().isoformat().__str__()}, 34 | command_list=bash_commands, 35 | sleep_time=0, 36 | max_parallel_jobs=6 37 | ) 38 | 39 | self.assertEqual(len(failures), 0) 40 | self.assertEqual(len(_failures) > 0, True) 41 | 42 | def test_execute_bash(self): 43 | pid, return_code, yarn_application_id, stdout, stderr = \ 44 | execute_bash(shared_data={'log_timestamp': datetime.now().isoformat().__str__()}, 45 | sleep_time=0, cmd='hadoop version') 46 | 47 | _pid, _return_code, _yarn_application_id, _stdout, _stderr = \ 48 | execute_bash(shared_data={'log_timestamp': datetime.now().isoformat().__str__()}, 49 | sleep_time=0, cmd='spark-submit') 50 | self.assertNotEqual(pid, None) 51 | self.assertNotEqual(stderr, None) 52 | self.assertNotEqual(stdout, None) 53 | 54 | self.assertEqual(len(yarn_application_id), 0) 55 | self.assertEqual(return_code == 0, True) 56 | 57 | self.assertNotEqual(_pid, None) 58 | self.assertNotEqual(_stderr, None) 59 | self.assertNotEqual(_stdout, None) 60 | 61 | self.assertEqual(len(_yarn_application_id), 0) 62 | self.assertEqual(_return_code > 0, True) 63 | 64 | 65 | if __name__ == '__main__': 66 | unittest.main() 67 | -------------------------------------------------------------------------------- /tests/test_spark_submit_utils.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | import logging 3 | import unittest 4 | 5 | from utils.Utilities import init_logging, cast_string_to_date, get_project_root, read_json_get_dict, read_yaml_get_dict 6 | from utils.spark_submit_utils import prepare_spark_submit 7 | 8 | 9 | class TestSparkSubmitUtils(unittest.TestCase): 10 | init_logging(datetime.datetime.now()) 11 | 12 | def test_get_project_root(self): 13 | self.assertEqual(get_project_root().__str__(), '/Users/v0m02sj/PycharmProjects/datapipelines-essentials') 14 | 15 | def test_cast_string_to_date(self): 16 | dt = cast_string_to_date('2020-01-01', '%Y-%m-%d') 17 | _dt = cast_string_to_date('abcdefg', '%Y-%m-%d') 18 | self.assertEqual(type(dt), datetime.datetime) 19 | self.assertEqual(_dt, None) 20 | 21 | def test_prepare_spark_submit_command(self): 22 | application_properties = read_json_get_dict( 23 | json_path=f"{get_project_root()}/main/src/resources/config/application_properties.json") 24 | runtime_args = {} # parse_arguments(application_properties.get('command_line_args')) 25 | runtime_args.update({ 26 | "workflow": "DVSkuDailyChannelWorkFlow", 27 | "refreshType": "history", 28 | "startDate": "2020-01-01", 29 | "endDate": "2020-01-10", 30 | "dq_enabled": "Y", 31 | "configFile": "/Users/v0m02sj/IdeaProjects/channel-perf-data-pipeline/configs/config-prod.yml" 32 | }) 33 | static_args = read_yaml_get_dict(runtime_args['configFile']) 34 | runtime_args.update({'configFile': runtime_args['configFile'].split('/')[-1]}) 35 | commands = prepare_spark_submit(runtime_args=runtime_args, config_args=static_args, 36 | app_config=application_properties) 37 | logging.debug(commands) 38 | self.assertEqual(len(commands) > 0, True) 39 | 40 | 41 | if __name__ == '__main__': 42 | unittest.main() 43 | --------------------------------------------------------------------------------
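
The S3 tests above (AwsS3Test, MockTestGlueJob and TestGlueJob) all bootstrap a mock S3 endpoint by spawning moto_server through subprocess.Popen with a Windows-only creation flag and then killing the process on teardown. A minimal sketch of an in-process alternative follows; it assumes a recent moto release that provides moto.server.ThreadedMotoServer and a local Spark installation with the hadoop-aws (s3a) connector on its classpath, neither of which is pinned by this repository, and the class name PortableMockedS3Test is illustrative rather than part of the codebase.

import unittest

import boto3
from moto.server import ThreadedMotoServer
from pyspark.sql import SparkSession


class PortableMockedS3Test(unittest.TestCase):
    @classmethod
    def setUpClass(cls) -> None:
        # Start the mock S3 endpoint inside the test process; no subprocess or OS-specific flags needed.
        cls.server = ThreadedMotoServer(port=5000)
        cls.server.start()
        cls.endpoint = "http://127.0.0.1:5000"

        # Create the bucket against the mocked endpoint.
        boto3.resource(
            "s3",
            endpoint_url=cls.endpoint,
            aws_access_key_id="dummy-value",
            aws_secret_access_key="dummy-value",
            region_name="us-east-1",
        ).create_bucket(Bucket="bucket")

        # Point s3a at the mocked endpoint, mirroring the hadoopConfiguration used in AwsS3Test.
        cls.spark = SparkSession.builder.getOrCreate()
        hadoop_conf = cls.spark.sparkContext._jsc.hadoopConfiguration()
        hadoop_conf.set("fs.s3a.access.key", "dummy-value")
        hadoop_conf.set("fs.s3a.secret.key", "dummy-value")
        hadoop_conf.set("fs.s3a.endpoint", cls.endpoint)
        hadoop_conf.set("fs.s3a.path.style.access", "true")  # usually required for a localhost endpoint
        hadoop_conf.set("fs.s3.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")

    def test_round_trip(self):
        # Write a small dataframe to the mocked bucket and read it back.
        df = self.spark.createDataFrame([("k1", 1), ("k2", 2)], ["key", "value"])
        df.write.mode("overwrite").csv("s3://bucket/source.csv")
        self.assertEqual(self.spark.read.csv("s3://bucket/source.csv").count(), 2)

    @classmethod
    def tearDownClass(cls) -> None:
        cls.spark.stop()
        cls.server.stop()  # no signals or process groups involved


if __name__ == "__main__":
    unittest.main()

The hadoopConfiguration calls mirror the ones already used in AwsS3Test, so the same s3a settings are exercised whichever way the mock server is started.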