├── .gitignore ├── README.md ├── dwh_pipelines ├── L0_src_data_generator │ ├── L0_email_bot.py │ ├── __pycache__ │ │ ├── L0_email_bot.cpython-310.pyc │ │ └── src_data_generator.cpython-310.pyc │ └── src_data_generator.py ├── L1_raw_layer │ ├── L1_email_bot.py │ ├── L1_raw_layer_approach.md │ ├── __pycache__ │ │ ├── L1_email_bot.cpython-310.pyc │ │ ├── raw_accommodation_bookings_tbl.cpython-310.pyc │ │ ├── raw_customer_feedbacks_tbl.cpython-310.pyc │ │ ├── raw_customer_info_tbl.cpython-310.pyc │ │ ├── raw_flight_bookings_tbl.cpython-310.pyc │ │ ├── raw_flight_destinations_tbl.cpython-310.pyc │ │ ├── raw_flight_promotion_deals_tbl.cpython-310.pyc │ │ ├── raw_flight_schedules_tbl.cpython-310.pyc │ │ ├── raw_flight_ticket_sales_tbl.cpython-310.pyc │ │ ├── raw_sales_agents_tbl.cpython-310.pyc │ │ └── raw_ticket_prices_tbl.cpython-310.pyc │ ├── raw-to-staging-diagram_2.png │ ├── raw_accommodation_bookings_tbl.py │ ├── raw_customer_feedbacks_tbl.py │ ├── raw_customer_info_tbl.py │ ├── raw_flight_bookings_tbl.py │ ├── raw_flight_destinations_tbl.py │ ├── raw_flight_promotion_deals_tbl.py │ ├── raw_flight_schedules_tbl.py │ ├── raw_flight_ticket_sales_tbl.py │ ├── raw_sales_agents_tbl.py │ └── raw_ticket_prices_tbl.py ├── L2_staging_layer │ ├── L2_email_bot.py │ ├── L2_staging_layer_approach.md │ ├── __pycache__ │ │ ├── L2_email_bot.cpython-310.pyc │ │ ├── stg_accommodation_bookings_tbl.cpython-310.pyc │ │ ├── stg_customer_feedbacks_tbl.cpython-310.pyc │ │ ├── stg_customer_info_tbl.cpython-310.pyc │ │ ├── stg_flight_bookings_tbl.cpython-310.pyc │ │ ├── stg_flight_destinations_tbl.cpython-310.pyc │ │ ├── stg_flight_promotion_deals_tbl.cpython-310.pyc │ │ ├── stg_flight_schedules_tbl.cpython-310.pyc │ │ ├── stg_flight_ticket_sales_tbl.cpython-310.pyc │ │ ├── stg_sales_agents_tbl.cpython-310.pyc │ │ └── stg_ticket_prices_tbl.cpython-310.pyc │ ├── dev │ │ ├── __pycache__ │ │ │ ├── stg_accommodation_bookings_tbl.cpython-310.pyc │ │ │ ├── stg_customer_feedbacks_tbl.cpython-310.pyc │ │ │ ├── stg_customer_info_tbl.cpython-310.pyc │ │ │ ├── stg_flight_bookings_tbl.cpython-310.pyc │ │ │ ├── stg_flight_destinations_tbl.cpython-310.pyc │ │ │ ├── stg_flight_promotion_deals_tbl.cpython-310.pyc │ │ │ ├── stg_flight_schedules_tbl.cpython-310.pyc │ │ │ ├── stg_flight_ticket_sales_tbl.cpython-310.pyc │ │ │ ├── stg_sales_agents_tbl.cpython-310.pyc │ │ │ └── stg_ticket_prices_tbl.cpython-310.pyc │ │ ├── stg_accommodation_bookings_tbl.py │ │ ├── stg_customer_feedbacks_tbl.py │ │ ├── stg_customer_info_tbl.py │ │ ├── stg_flight_bookings_tbl.py │ │ ├── stg_flight_destinations_tbl.py │ │ ├── stg_flight_promotion_deals_tbl.py │ │ ├── stg_flight_schedules_tbl.py │ │ ├── stg_flight_ticket_sales_tbl.py │ │ ├── stg_sales_agents_tbl.py │ │ └── stg_ticket_prices_tbl.py │ ├── prod │ │ ├── __pycache__ │ │ │ ├── create_prod_env.cpython-310.pyc │ │ │ └── create_stg_prod_env.cpython-310.pyc │ │ └── create_stg_prod_env.py │ ├── staging-to-semantic-diagram.png │ └── tests │ │ ├── __pycache__ │ │ ├── test_stg_accommodation_bookings_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_stg_customer_feedbacks_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_stg_customer_info_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_stg_flight_bookings_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_stg_flight_destinations_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_stg_flight_promotion_deals_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_stg_flight_schedules_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_stg_flight_ticket_sales_tbl.cpython-310-pytest-7.2.1.pyc │ │ 
├── test_stg_sales_agents_tbl.cpython-310-pytest-7.2.1.pyc │ │ └── test_stg_ticket_prices_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_stg_accommodation_bookings_tbl.py │ │ ├── test_stg_customer_feedbacks_tbl.py │ │ ├── test_stg_customer_info_tbl.py │ │ ├── test_stg_flight_bookings_tbl.py │ │ ├── test_stg_flight_destinations_tbl.py │ │ ├── test_stg_flight_promotion_deals_tbl.py │ │ ├── test_stg_flight_schedules_tbl.py │ │ ├── test_stg_flight_ticket_sales_tbl.py │ │ ├── test_stg_sales_agents_tbl.py │ │ └── test_stg_ticket_prices_tbl.py ├── L3_semantic_layer │ ├── L3_email_bot.py │ ├── __pycache__ │ │ └── L3_email_bot.cpython-310.pyc │ ├── dev │ │ ├── __pycache__ │ │ │ ├── dim_accommodation_bookings_tbl.cpython-310.pyc │ │ │ ├── dim_customer_feedbacks_tbl.cpython-310.pyc │ │ │ ├── dim_customer_info_tbl.cpython-310.pyc │ │ │ ├── dim_flight_bookings_tbl.cpython-310.pyc │ │ │ ├── dim_flight_destinations_tbl.cpython-310.pyc │ │ │ ├── dim_flight_promotion_deals_tbl.cpython-310.pyc │ │ │ ├── dim_flight_schedules_tbl.cpython-310.pyc │ │ │ ├── dim_flight_ticket_sales_tbl.cpython-310.pyc │ │ │ ├── dim_sales_agents_tbl.cpython-310.pyc │ │ │ └── dim_ticket_prices_tbl.cpython-310.pyc │ │ ├── dim_accommodation_bookings_tbl.py │ │ ├── dim_customer_feedbacks_tbl.py │ │ ├── dim_customer_info_tbl.py │ │ ├── dim_flight_bookings_tbl.py │ │ ├── dim_flight_destinations_tbl.py │ │ ├── dim_flight_promotion_deals_tbl.py │ │ ├── dim_flight_schedules_tbl.py │ │ ├── dim_flight_ticket_sales_tbl.py │ │ ├── dim_sales_agents_tbl.py │ │ └── dim_ticket_prices_tbl.py │ ├── prod │ │ ├── __pycache__ │ │ │ └── create_sem_prod_env.cpython-310.pyc │ │ └── create_sem_prod_env.py │ ├── semantic-to-dwh-diagram.png │ └── tests │ │ ├── __pycache__ │ │ ├── test_dim_accommodation_bookings_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_dim_customer_feedbacks_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_dim_customer_info_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_dim_flight_bookings_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_dim_flight_destinations_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_dim_flight_promotion_deals_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_dim_flight_schedules_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_dim_flight_ticket_sales_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_dim_sales_agents_tbl.cpython-310-pytest-7.2.1.pyc │ │ └── test_dim_ticket_prices_tbl.cpython-310-pytest-7.2.1.pyc │ │ ├── test_dim_accommodation_bookings_tbl.py │ │ ├── test_dim_customer_feedbacks_tbl.py │ │ ├── test_dim_customer_info_tbl.py │ │ ├── test_dim_flight_bookings_tbl.py │ │ ├── test_dim_flight_destinations_tbl.py │ │ ├── test_dim_flight_promotion_deals_tbl.py │ │ ├── test_dim_flight_schedules_tbl.py │ │ ├── test_dim_flight_ticket_sales_tbl.py │ │ ├── test_dim_sales_agents_tbl.py │ │ └── test_dim_ticket_prices_tbl.py ├── L4_dwh_layer │ ├── L4_email_bot.py │ ├── __pycache__ │ │ └── L4_email_bot.cpython-310.pyc │ ├── datamarts │ │ ├── ERD - 1st level.png │ │ ├── ERD - 2nd level.png │ │ ├── __pycache__ │ │ │ ├── dim_customers_tbl.cpython-310.pyc │ │ │ ├── dim_date_tbl.cpython-310.pyc │ │ │ ├── dim_dates_tbl.cpython-310.pyc │ │ │ ├── dim_destinations_tbl.cpython-310.pyc │ │ │ ├── dim_flights_tbl.cpython-310.pyc │ │ │ ├── dim_prices_tbl.cpython-310.pyc │ │ │ ├── dim_promotions_tbl.cpython-310.pyc │ │ │ ├── dim_sales_employees_tbl.cpython-310.pyc │ │ │ ├── dim_schedules_tbl.cpython-310.pyc │ │ │ ├── fact_accommodations_tbl.cpython-310.pyc │ │ │ └── fact_sales_tbl.cpython-310.pyc │ │ ├── dim_customers_tbl.py │ │ ├── dim_dates_tbl.py 
│ │ ├── dim_destinations_tbl.py │ │ ├── dim_flights_tbl.py │ │ ├── dim_prices_tbl.py │ │ ├── dim_promotions_tbl.py │ │ ├── dim_sales_employees_tbl.py │ │ ├── dim_schedules_tbl.py │ │ ├── fact_accommodations_tbl.py │ │ └── fact_sales_tbl.py │ ├── dwh-diagram.png │ ├── reporting_channel │ │ ├── __pycache__ │ │ │ └── app.cpython-310.pyc │ │ ├── app.py │ │ ├── dash-plotly-travel-dashboard_1.png │ │ ├── dash-plotly-travel-dashboard_2.png │ │ └── dash-plotly-travel-dashboard_3.png │ └── user_access_layer │ │ ├── __pycache__ │ │ ├── avg_ticket_prices_by_year.cpython-310.pyc │ │ ├── customer_booking_trend.cpython-310.pyc │ │ ├── flight_bookings_by_age.cpython-310.pyc │ │ ├── ticket_sales_by_age.cpython-310.pyc │ │ ├── top_destinations.cpython-310.pyc │ │ ├── total_sales_by_destination.cpython-310.pyc │ │ ├── total_sales_by_payment_method.cpython-310.pyc │ │ └── total_sales_by_year.cpython-310.pyc │ │ ├── avg_ticket_prices_by_year.py │ │ ├── customer_booking_trend.py │ │ ├── flight_bookings_by_age.py │ │ ├── top_destinations.py │ │ ├── total_sales_by_destination.py │ │ ├── total_sales_by_payment_method.py │ │ └── total_sales_by_year.py ├── dwh_approach.md ├── governance │ └── data_team_rbac.py ├── orchestration │ └── dwh_flows.py └── performance_tuning │ ├── dwh_indexes.py │ └── dwh_partitions.py ├── environment.yaml └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Files 2 | .env 3 | *.json 4 | *.ini 5 | 6 | 7 | 8 | 9 | # Folders 10 | /dwh_pipelines/L0_src_data_generator/src_data 11 | /logs -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Preface 2 | 3 | ***Disclaimer: This was just for fun. In a real-world setting there are more appropriate options that address modern-day warehousing challenges for different businesses depending on the problem statement at hand, so consider your company's (or client's) unique challenges before replicating any techniques used in this blog.*** 4 | 5 | # Why build this...? 6 | 7 | This is a quick-fire project to demonstrate an event-driven Postgres data warehouse can be built using vanilla Python and SQL code while remaining performant and highly available for analytical use cases. 8 | 9 | Here's why I used each language: 10 | 11 | * **Python** \- for creating the pipelines connecting the different layers 12 | 13 | * **SQL** \- for creating and querying the Postgres tables 14 | 15 | 16 | Although cloud data warehouses are growing increasingly popular in the analytics world, there are still scenarios where traditional data warehouses would serve an enterprise better than a cloud data warehouse. More on this will be shared in a future blog post. 17 | 18 | # Configuration 19 | 20 | ### GitHub repo 21 | 22 | I create a GitHub repo called `postgres-dwh` and set up the remote repository on my local machine. 
23 | 24 | Once I enter the folder of my choice, I clone the remote repo to the folder: 25 | 26 | ```bash 27 | git clone 28 | ``` 29 | 30 | …then I navigate into the new local repo: 31 | 32 | ```bash 33 | cd postgres-dwh 34 | ``` 35 | 36 | …and then create a branch: 37 | 38 | ```bash 39 | git checkout -b config-branch 40 | ``` 41 | 42 | …then finally create the starter files (via the Windows terminal): 43 | 44 | ```cmd 45 | type NUL > .gitignore & type NUL > environment.yaml & type NUL > README.md & type NUL > .env 46 | ``` 47 | 48 | …and now I’m in! 49 | 50 | ### Environment.yaml 51 | 52 | I'm using conda as the package manager and pulling my modules from conda-forge, because some of them may not be available in the default Anaconda channel. 53 | 54 | Here's what my environment.yaml file looks like: 55 | 56 | ```yaml 57 | name: postgres_dwh 58 | channels: 59 | - conda-forge 60 | dependencies: 61 | - python=3.10 62 | - pandas 63 | - faker 64 | - path 65 | - psycopg2 66 | - configparser 67 | ``` 68 | 69 | Then I create my environment with this command in the terminal (once I’m in the directory of my choice): 70 | 71 | ```bash 72 | conda env create -f environment.yaml 73 | ``` 74 | 75 | …and then I activate my environment with this: 76 | 77 | ```bash 78 | conda activate postgres_dwh 79 | ``` 80 | 81 | # Planning (Wireframe) 82 | 83 | ## Layers🌰 84 | 85 | The DWH architecture will consist of: 86 | 87 | * **Raw layer** - for extracting data from source systems 88 | 89 | * **Staging layer** - for cleaning and pre-processing raw data 90 | 91 | * **Semantic layer** - for consolidating and integrating clean data 92 | 93 | * **Data warehouse layer** - for exposing the single version of truth (SVOT) to the target end users (downstream teams, external stakeholders etc.) 94 | 95 | * **Presentation layer** - for creating data stories with the SVOT data 96 | 97 | * **Governance layer** - for setting policies and practices to keep enterprise data accurate, consistent and high quality 98 | 99 | * **Orchestration layer** - for managing the execution of workflows via event-based triggers or time intervals 100 | 101 | 102 | From the raw layer through to the data warehouse layer, I log data profiling metrics to the console that show the properties and statistics of the data at each stage, so that I can monitor the quality of the data moving between stages. 103 | 104 | Here's what they look like when generating the travel data for this project: 105 | 106 | 107 | [![](https://markdown-videos.deta.dev/youtube/soI8m1B2y3g)](https://youtu.be/soI8m1B2y3g) 108 | 109 | If there is any unusual behaviour between processes, I can spot it in the data profiling metrics and troubleshoot accordingly. 110 | 111 | # Raw layer 112 | 113 | * Load source tables into raw tables 114 | 115 | * Highlight sensitive fields 116 | 117 | * Add event logging 118 | 119 | * Run data profiling checks 120 | 121 | 122 | The main objective here is to get the source data pulled (or copied) into this layer. No transformations are applied at this stage, because the data needs to stay as close to its original state as possible, in a tabular format. 
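To make this concrete, here is a minimal sketch of what a raw-layer load could look like. It is illustrative only: the table name, columns, JSON source file and connection settings are assumptions rather than the exact contents of the `raw_*_tbl.py` scripts, but it shows the pattern of creating a raw table, copying the source rows in unchanged and printing simple profiling metrics.

```python
# Illustrative sketch only - table/column/file names and connection settings are
# assumptions, not the exact code in the raw_*_tbl.py scripts.
import json
import psycopg2

connection = psycopg2.connect(
    host="localhost", port=5432, dbname="raw_db", user="postgres", password="postgres"
)
connection.set_session(autocommit=True)
cursor = connection.cursor()

# 1. Create the raw table - no transformations, columns simply mirror the source fields
cursor.execute("""
    CREATE TABLE IF NOT EXISTS raw_customer_info (
        customer_id     UUID PRIMARY KEY,
        first_name      VARCHAR,
        last_name       VARCHAR,
        email           VARCHAR,        -- sensitive field (PII)
        created_date    DATE
    );
""")

# 2. Load the source data as-is (assuming the source file is a JSON array of records)
with open("src_data/customer_info.json") as source_file:
    records = json.load(source_file)

for record in records:
    cursor.execute(
        """INSERT INTO raw_customer_info (customer_id, first_name, last_name, email, created_date)
           VALUES (%s, %s, %s, %s, %s)
           ON CONFLICT (customer_id) DO NOTHING;""",
        (record["customer_id"], record["first_name"], record["last_name"],
         record["email"], record["created_date"]),
    )

# 3. Basic data profiling metrics written to the console
cursor.execute("SELECT COUNT(*) FROM raw_customer_info;")
row_count = cursor.fetchone()[0]
print(f"Rows in source file:              {len(records)}")
print(f"Rows in raw_customer_info table:  {row_count}")

cursor.close()
connection.close()
```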
123 | 124 | ![](https://cdn.hashnode.com/res/hashnode/image/upload/v1677507737420/184026a5-81e2-482b-9c01-be413ab8f69d.png) 125 | 126 | This stage is also useful for identifying the tables, columns and rows that contain sensitive data, which could compromise the privacy of any person or entity connected to the business if not treated appropriately. I've flagged these fields for all tables and laid out the treatment I've adopted for them in the data governance section. 127 | 128 | # Staging layer 129 | 130 | My focus at this layer is to clean the data and shape it into a format suitable for adding business rules. This is done by first outlining the transformation specifications for each table and then setting up data quality checks to test the constraints placed on the data. 131 | 132 | ![](https://cdn.hashnode.com/res/hashnode/image/upload/v1677508359417/d674eed8-2999-4f21-bf69-0720a0264bd9.png) 133 | 134 | Once I've specified the transformation strategy, I convert my transformation intents into Python logic, which does the heavy lifting for the transformations in this project. SQL is simply used for creating tables and inserting data from the raw tables into the staging tables. 135 | 136 | After the DQ tests are complete, the main database serves as the development environment (or DEV environment for short). The orchestration tool then spins up a production environment (or PROD environment for short) by replicating the tested DEV environment. 137 | 138 | Here is what the DQ tests and report look like in real time: 139 | 140 | 141 | [![](https://markdown-videos.deta.dev/youtube/nrKeXpgWSVs)](https://youtu.be/nrKeXpgWSVs) 142 | 143 | The PROD environment is the one used for any reporting and analysis, while the DEV environment is used by data scientists and analysts to run their tests and experiments. 144 | 145 | # Semantic layer 146 | 147 | At this layer, the cleaned data is framed and tested to form the single version of truth. This is done by using the staging tables to create the MDM tables that will be used for dimensional modelling. 148 | 149 | To kick-start this section, a DEV database is created. Then I create empty semantic tables in the database. After that, the staging data is loaded into the semantic tables using foreign data wrappers, and the business rules are added to the semantic tables. Finally, a date dimension table is created to add granularity down to the day level. 150 | 151 | ![](https://cdn.hashnode.com/res/hashnode/image/upload/v1677510285406/c85d0ec6-9603-4ac3-bdf2-08d0080df18c.png) 152 | 153 | The DEV database is replicated after the DQ tests are completed, and the replica serves as the production database, for the same reasons stated in the previous layer. 154 | 155 | The PROD database will contain the enriched data that forms the single version of truth (SVOT). 156 | 157 | The MDM tables in this PROD database are what serve the SVOT to its consumers. 158 | 159 | # Data warehouse layer 160 | 161 | Here is where the data from the SVOT is made available for the target users to perform their intended operations, such as reporting, analysis, investigations and audits, among others. 162 | 163 | ![](https://cdn.hashnode.com/res/hashnode/image/upload/v1677510300184/e7f6a3d1-edc4-4bbc-bda0-b20681fabb8c.png) 164 | 165 | The first step is to apply dimensional modelling to the data available. This is done by outlining the relationships between different tables (or entities) and then joining the entities together to form facts and dimensions. 
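As an illustration of that modelling step (and of the aggregated tables described next), here is a hedged sketch of how one dimension, one fact and one aggregate table might be created with psycopg2. The table and column names are assumptions for the example; the real DDL lives in the `L4_dwh_layer/datamarts` and `user_access_layer` scripts.

```python
# Illustrative sketch only - the real fact/dimension DDL lives in L4_dwh_layer/datamarts,
# and the aggregate query lives in the user_access_layer scripts.
import psycopg2

connection = psycopg2.connect(
    host="localhost", port=5432, dbname="dwh_db", user="postgres", password="postgres"
)
connection.set_session(autocommit=True)
cursor = connection.cursor()

# Dimension: one row per flight destination
cursor.execute("""
    CREATE TABLE IF NOT EXISTS dim_destinations (
        destination_id      SERIAL PRIMARY KEY,
        destination_name    VARCHAR NOT NULL
    );
""")

# Fact: one row per ticket sale, pointing at its dimensions
cursor.execute("""
    CREATE TABLE IF NOT EXISTS fact_sales (
        sale_id             SERIAL PRIMARY KEY,
        destination_id      INT REFERENCES dim_destinations (destination_id),
        booking_date        DATE,
        ticket_price        NUMERIC(10, 2)
    );
""")

# Aggregate table answering "how many tickets were sold per destination?"
cursor.execute("""
    CREATE TABLE IF NOT EXISTS total_sales_by_destination AS
        SELECT d.destination_name,
               COUNT(*)            AS tickets_sold,
               SUM(f.ticket_price) AS total_revenue
        FROM   fact_sales f
        JOIN   dim_destinations d USING (destination_id)
        WHERE  f.booking_date BETWEEN '2022-01-01' AND '2022-12-31'
        GROUP  BY d.destination_name;
""")

cursor.close()
connection.close()
```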
166 | 167 | Then I join the appropriate fact and dimension tables to create aggregated tables that answer specific business questions raised by the target audience. 168 | 169 | For example, if the sales team wanted to find out how many flight tickets were sold for a specific destination, we could create a table from the DWH that groups the sales records by destination, filters the results by date range and applies the relevant governance policies to grant the right users access to query the table. 170 | 171 | # Presentation layer 172 | 173 | This layer is for visualizing the data from the previous stage. 174 | 175 | I plug the aggregate tables into the visualization modules to create the visuals that answer the relevant questions the stakeholders may have. I use Plotly-Dash to create a simple interactive dashboard to demonstrate how a BI tool can easily plug into Postgres: 176 | 177 | ![](https://cdn.hashnode.com/res/hashnode/image/upload/v1677519154148/2cbd71aa-6515-4199-b553-74523dde2a03.png) 178 | 179 | # Governance layer 180 | 181 | I set up the following governance policies: 182 | 183 | * **Role-based access control** - for creating custom roles and assigning privileges based on each team's distinct responsibilities 184 | 185 | * **Row security policies** - for restricting what each custom role can see in each table 186 | 187 | * **Table ownership** - for allocating table ownership privileges to the appropriate roles 188 | 189 | 190 | ## Role-based access control 191 | 192 | Here is where I create custom roles that possess a distinct set of privileges shaped around how each team will access the data warehouse. 193 | 194 | The steps for this stage are as follows: 195 | 196 | 1. Create custom roles using the `CREATE ROLE` command 197 | 198 | 2. Grant roles access to use the appropriate databases and schemas using the `GRANT` command 199 | 200 | 3. Grant appropriate privileges to custom roles (e.g. `SELECT`, `UPDATE`, `INSERT`, `DELETE`) using the `GRANT` command 201 | 202 | 4. Restrict roles from accessing certain databases, schemas and tables using the `REVOKE` command 203 | 204 | 205 | For simplicity’s sake, I've created the following custom roles in this project...: 206 | 207 | * junior\_data\_analyst 208 | 209 | * senior\_data\_analyst 210 | 211 | * junior\_data\_engineer 212 | 213 | * senior\_data\_engineer 214 | 215 | * junior\_data\_scientist 216 | 217 | * senior\_data\_scientist 218 | 219 | 220 | ...and supplied the general constraints required for the following roles: 221 | 222 | ### Junior data analyst 223 | 224 | * Can select from tables and views in the DWH layer 225 | 226 | * Cannot create or modify tables and views 227 | 228 | * Cannot delete any objects 229 | 230 | * Cannot execute stored procedures 231 | 232 | * Cannot access the raw, staging and semantic layers 233 | 234 | 235 | ### Senior data analyst 236 | 237 | * Same privileges as the `junior_data_analyst` role 238 | 239 | * Cannot access the raw, staging and semantic layers 240 | 241 | 242 | # Orchestration layer 243 | 244 | An orchestration section is needed for managing how, and under what conditions, each DWH task is executed. With Airflow, Prefect and Argo to pick from, I went with Prefect because the UI displays the workflows in an elegant, clean and easy-to-follow manner, and its configuration is the quickest of the available options. 
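As a rough illustration, a Prefect 2 flow for this warehouse could chain the layers together like the sketch below. The task names and print statements are assumptions for the example, not the exact contents of `orchestration/dwh_flows.py`.

```python
# Minimal sketch of a Prefect 2 (Orion) flow - task bodies are placeholders,
# not the exact contents of orchestration/dwh_flows.py.
from prefect import flow, task


@task
def load_raw_layer():
    print("Loading source data into the raw tables...")


@task
def load_staging_layer():
    print("Cleaning raw data and loading the staging tables...")


@task
def load_semantic_layer():
    print("Applying business rules and building the MDM tables...")


@task
def load_dwh_layer():
    print("Building the facts, dimensions and aggregate tables...")


@flow(name="postgres_dwh_flow")
def run_postgres_dwh_flow():
    # Direct task calls run sequentially, one layer after another
    load_raw_layer()
    load_staging_layer()
    load_semantic_layer()
    load_dwh_layer()


if __name__ == "__main__":
    run_postgres_dwh_flow()
```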
245 | 246 | I use the `prefect orion start` command in the terminal to access the tasks and flow runs for each DWH activity in the UI, which displays the workflows like this: 247 | 248 | [![](https://markdown-videos.deta.dev/youtube/blAhdR6NySk)](https://youtu.be/blAhdR6NySk) 249 | 250 | # Conclusion 251 | 252 | Contrary to popular belief, PostgreSQL can serve as a great data warehouse option, but this solely depends on the design decisions that go into it. As long as there is an awareness of the computing resource limitations it carries compared to many cloud data warehousing options like Snowflake, Amazon Redshift, and Google BigQuery, the RDBMS can serve as a high-performance data warehouse that meets business reporting needs in many use cases. 253 | 254 | In my next project I will be walking you through how I migrated this same on-premise travel data into the AWS cloud, and how I built a cloud data warehouse with it using Amazon Redshift, together with more robust technologies like Spark, AWS Athena, AWS Glue and so on. 255 | 256 | [**You can find the full source code for this project on my GitHub here.**](https://github.com/sdw-online/postgres-dwh) 257 | 258 | Feel free to reach out via my handles: [**LinkedIn**](https://www.linkedin.com/in/stephen-david-williams-860428123/) | [**Email**](mailto:stephenodavidwilliams@gmail.com) | [**Twitter**](https://twitter.com/sdw_online) -------------------------------------------------------------------------------- /dwh_pipelines/L0_src_data_generator/L0_email_bot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import smtplib 3 | from datetime import datetime 4 | from email.mime.multipart import MIMEMultipart 5 | from email.mime.base import MIMEBase 6 | from email.mime.text import MIMEText 7 | from email.utils import COMMASPACE 8 | from email import encoders 9 | from pathlib import Path 10 | from dotenv import load_dotenv 11 | 12 | 13 | # Load environment variables from .env 14 | load_dotenv() 15 | 16 | 17 | 18 | # Set up constants 19 | current_filepath = Path(__file__).stem 20 | 21 | SMTP_PORT = 587 22 | SMTP_HOST_SERVER = "smtp.gmail.com" 23 | CURRENT_TIMESTAMP = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 24 | EMAIL_ADDRESS = os.getenv("SENDER") 25 | EMAIL_PASSWORD = os.getenv("EMAIL_PASSWORD") 26 | SENDER = "Postgres Data Warehouse Program - SDW" 27 | RECIPIENT = os.getenv("RECIPIENT") 28 | 29 | 30 | L0_LOG_DIRECTORY = os.getenv("L0_LOG_DIRECTORY") 31 | L1_LOG_DIRECTORY = os.getenv("L1_LOG_DIRECTORY") 32 | L2_LOG_DIRECTORY = os.getenv("L2_LOG_DIRECTORY") 33 | L3_LOG_DIRECTORY = os.getenv("L3_LOG_DIRECTORY") 34 | L4_LOG_DIRECTORY = os.getenv("L4_LOG_DIRECTORY") 35 | 36 | body_main_subject = "extracting the travel data from the source systems" 37 | body = f"""Hi Stephen, 38 | 39 | See attached the logs for {body_main_subject}. 
40 | 41 | Regards, 42 | {SENDER} 43 | 44 | """ 45 | 46 | 47 | # Create function for getting the directory paths for log files 48 | def get_log_filepaths(log_directory): 49 | log_filepaths = [] 50 | for root, directories, log_files in os.walk(log_directory): 51 | for filename in log_files: 52 | log_filepath = os.path.join(root, filename) 53 | log_filepaths.append(log_filepath) 54 | return log_filepaths 55 | 56 | 57 | # Create function for attaching log files to email 58 | def attach_log_files_to_email(message, log_filepaths): 59 | for log_file in log_filepaths: 60 | with open(log_file, 'rb') as file: 61 | log_attachment = MIMEBase('application', 'octet-stream') 62 | log_attachment.set_payload(file.read()) 63 | encoders.encode_base64(log_attachment) 64 | log_attachment.add_header('Content-Disposition', f'attachment; filename="{os.path.basename(log_file)}"') 65 | message.attach(log_attachment) 66 | 67 | 68 | 69 | 70 | # ===================================== SETTING UP LOG FILE ATTACHMENTS ===================================== 71 | 72 | 73 | # Get directory paths for log files 74 | data_gen_log_directory = get_log_filepaths(L0_LOG_DIRECTORY) 75 | 76 | log_file_counter = 0 77 | for log_file in data_gen_log_directory: 78 | log_file_counter += 1 79 | print('') 80 | print(f'Log file {log_file_counter}: {log_file} ') 81 | 82 | 83 | 84 | # ===================================== SETTING UP EMAIL MESSAGE ===================================== 85 | 86 | # Set up constants for email 87 | message = MIMEMultipart() 88 | message["From"] = SENDER 89 | message["To"] = RECIPIENT 90 | message["Subject"] = f"L0 - Travel Data Generation Log - {CURRENT_TIMESTAMP}" 91 | 92 | 93 | # Add body to the email message 94 | message.attach(MIMEText(body, "plain")) 95 | 96 | 97 | # Attach log files to email 98 | attach_log_files_to_email(message, data_gen_log_directory) 99 | 100 | 101 | 102 | # ===================================== SENDING EMAIL MESSAGE ===================================== 103 | 104 | 105 | def send_email(): 106 | with smtplib.SMTP(host=SMTP_HOST_SERVER, port=SMTP_PORT) as smtp: 107 | smtp.ehlo() 108 | smtp.starttls() 109 | smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD) 110 | smtp.send_message(message) 111 | print('Message sent successfully. 
') 112 | print() 113 | 114 | 115 | send_email() 116 | 117 | 118 | -------------------------------------------------------------------------------- /dwh_pipelines/L0_src_data_generator/__pycache__/L0_email_bot.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L0_src_data_generator/__pycache__/L0_email_bot.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L0_src_data_generator/__pycache__/src_data_generator.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L0_src_data_generator/__pycache__/src_data_generator.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/L1_email_bot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import smtplib 3 | from datetime import datetime 4 | from email.mime.multipart import MIMEMultipart 5 | from email.mime.base import MIMEBase 6 | from email.mime.text import MIMEText 7 | from email.utils import COMMASPACE 8 | from email import encoders 9 | from pathlib import Path 10 | from dotenv import load_dotenv 11 | 12 | 13 | # Load environment variables from .env 14 | load_dotenv() 15 | 16 | 17 | 18 | # Set up constants 19 | current_filepath = Path(__file__).stem 20 | 21 | SMTP_PORT = 587 22 | SMTP_HOST_SERVER = "smtp.gmail.com" 23 | CURRENT_TIMESTAMP = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 24 | EMAIL_ADDRESS = os.getenv("SENDER") 25 | EMAIL_PASSWORD = os.getenv("EMAIL_PASSWORD") 26 | SENDER = "Postgres Data Warehouse Program - SDW" 27 | RECIPIENT = os.getenv("RECIPIENT") 28 | 29 | 30 | L0_LOG_DIRECTORY = os.getenv("L0_LOG_DIRECTORY") 31 | L1_LOG_DIRECTORY = os.getenv("L1_LOG_DIRECTORY") 32 | L2_LOG_DIRECTORY = os.getenv("L2_LOG_DIRECTORY") 33 | L3_LOG_DIRECTORY = os.getenv("L3_LOG_DIRECTORY") 34 | L4_LOG_DIRECTORY = os.getenv("L4_LOG_DIRECTORY") 35 | 36 | body_main_subject = "loading data from source systems into the raw tables of the Postgres data warehouse" 37 | body = f"""Hi Stephen, 38 | 39 | See attached the logs for {body_main_subject}. 
40 | 41 | Regards, 42 | {SENDER} 43 | 44 | """ 45 | 46 | 47 | # Create function for getting the directory paths for log files 48 | def get_log_filepaths(log_directory): 49 | log_filepaths = [] 50 | for root, directories, log_files in os.walk(log_directory): 51 | for filename in log_files: 52 | log_filepath = os.path.join(root, filename) 53 | log_filepaths.append(log_filepath) 54 | return log_filepaths 55 | 56 | 57 | # Create function for attaching log files to email 58 | def attach_log_files_to_email(message, log_filepaths): 59 | for log_file in log_filepaths: 60 | with open(log_file, 'rb') as file: 61 | log_attachment = MIMEBase('application', 'octet-stream') 62 | log_attachment.set_payload(file.read()) 63 | encoders.encode_base64(log_attachment) 64 | log_attachment.add_header('Content-Disposition', f'attachment; filename="{os.path.basename(log_file)}"') 65 | message.attach(log_attachment) 66 | 67 | 68 | 69 | 70 | # ===================================== SETTING UP LOG FILE ATTACHMENTS ===================================== 71 | 72 | 73 | # Get directory paths for log files 74 | raw_layer_log_directory = get_log_filepaths(L1_LOG_DIRECTORY) 75 | 76 | log_file_counter = 0 77 | for log_file in raw_layer_log_directory: 78 | log_file_counter += 1 79 | print('') 80 | print(f'Log file {log_file_counter}: {log_file} ') 81 | 82 | 83 | 84 | # ===================================== SETTING UP EMAIL MESSAGE ===================================== 85 | 86 | # Set up constants for email 87 | message = MIMEMultipart() 88 | message["From"] = SENDER 89 | message["To"] = RECIPIENT 90 | message["Subject"] = f"L1 - Raw Layer Log Files - {CURRENT_TIMESTAMP}" 91 | 92 | 93 | # Add body to the email message 94 | message.attach(MIMEText(body, "plain")) 95 | 96 | 97 | # Attach log files to email 98 | attach_log_files_to_email(message, raw_layer_log_directory) 99 | 100 | 101 | 102 | # ===================================== SENDING EMAIL MESSAGE ===================================== 103 | 104 | def send_email(): 105 | with smtplib.SMTP(host=SMTP_HOST_SERVER, port=SMTP_PORT) as smtp: 106 | smtp.ehlo() 107 | smtp.starttls() 108 | smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD) 109 | smtp.send_message(message) 110 | print('Message sent successfully. ') 111 | print() 112 | 113 | 114 | send_email() 115 | 116 | -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/L1_raw_layer_approach.md: -------------------------------------------------------------------------------- 1 | # Approach: Raw Layer 2 | 3 | 4 | ## Acceptance criteria 5 | 6 | - The raw layer must extract data from various source systems with no issues 7 | - An internal program must log the events for the extraction process (and other processes supporting extraction) to a file and console 8 | - Sensitive fields and records must be highlighted to determine the appropriate treatment required 9 | - Data profiling metrics must be logged to a file and console to understand the raw data’s properties and structure during each load process 10 | 11 | 12 | ### Completion 13 | 14 | Once the acceptance criteria is 100% covered and satisfied, the raw layer’s tasks are officially completed. 
15 | 16 | 17 | 18 | 19 | 20 | 21 | ## Micro tasks 22 | 23 | 24 | ### Table 25 | 26 | | task_no | layer_name | task | task_type| completion_status | 27 | | ----- | -------------- | --------------- | --------------- | --------------- | 28 | | DWH-001 | RAW | Load source data for “Customer information” into raw table `raw_customer_data` | LOAD_TO_RAW | NOT_STARTED | 29 | | DWH-002 | RAW | Load source data for “Flight schedules” into raw table `raw_flight_schedules_data`| LOAD_TO_RAW | NOT_STARTED | 30 | | DWH-003 | RAW | Load source data for “Customer feedback” into raw table `raw_customer_feedback_data`| LOAD_TO_RAW | NOT_STARTED | 31 | | DWH-004 | RAW | Load source data for “Ticket price data” into raw table `raw_ticket_price_data`| LOAD_TO_RAW | NOT_STARTED | 32 | | DWH-005 | RAW | Load source data for “Raw customer demographic data” into raw table `raw_customer_demographic_data`| LOAD_TO_RAW | NOT_STARTED | 33 | | DWH-006 | RAW | Load source data for “Flight destination information” into raw table `raw_flight_destination_data`| LOAD_TO_RAW | NOT_STARTED | 34 | | DWH-007 | RAW | Load source data for “Flight ticket sales” into raw table `raw_flight_ticket_sales_data`| LOAD_TO_RAW | NOT_STARTED | 35 | | DWH-008 | RAW | Load source data for “Flight Promotion” into raw table `raw_flight_promotion_data`| LOAD_TO_RAW | NOT_STARTED | 36 | | DWH-009 | RAW | Load source data for “Holiday data” into raw table `raw_holiday_data`| LOAD_TO_RAW | NOT_STARTED | 37 | | DWH-010 | RAW | Load source data for “Airline data” into raw table `raw_airline_data`| LOAD_TO_RAW | NOT_STARTED | 38 | | DWH-011 | RAW | Load source data for “Sales agent data” into raw table `raw_sales_agent_data`| LOAD_TO_RAW | NOT_STARTED | 39 | | DWH-012 | RAW | Load source data for “Ticket sales data” into raw table `raw_ticket_sales_data`| LOAD_TO_RAW | NOT_STARTED | 40 | | DWH-013 | RAW | Load source data for “Flight bookings data” into raw table `raw_flight_bookings_data`| LOAD_TO_RAW | NOT_STARTED | 41 | | DWH-014 | RAW | Load source data for “Fight destination revenue” into raw table `raw_flight_destination_revenue_data`| LOAD_TO_RAW | NOT_STARTED | 42 | | DWH-015 | RAW | Mark the sensitive fields (PII/PHI) in `raw_customer_data`| SENSITIVE_DATA_MARKING | NOT_STARTED | 43 | | DWH-016 | RAW | Mark the sensitive fields (PII/PHI) in `raw_flight_schedules_data`| SENSITIVE_DATA_MARKING | NOT_STARTED | 44 | | DWH-017 | RAW | Mark the sensitive fields (PII/PHI) in `raw_customer_feedback_data`| SENSITIVE_DATA_MARKING | NOT_STARTED | 45 | | DWH-018 | RAW | Mark the sensitive fields (PII/PHI) in `raw_ticket_price_data`| SENSITIVE_DATA_MARKING | NOT_STARTED | 46 | | DWH-019 | RAW | Mark the sensitive fields (PII/PHI) in `raw_customer_demographic_data`| SENSITIVE_DATA_MARKING | NOT_STARTED | 47 | | DWH-020 | RAW | Mark the sensitive fields (PII/PHI) in `raw_flight_destination_data`| SENSITIVE_DATA_MARKING | NOT_STARTED | 48 | | DWH-021 | RAW | Mark the sensitive fields (PII/PHI) in `raw_flight_ticket_sales_data`| SENSITIVE_DATA_MARKING | NOT_STARTED | 49 | | DWH-022 | RAW | Mark the sensitive fields (PII/PHI) in `raw_flight_promotion_data`| SENSITIVE_DATA_MARKING | NOT_STARTED | 50 | | DWH-023 | RAW | Mark the sensitive fields (PII/PHI) in `raw_airline_data`| SENSITIVE_DATA_MARKING | NOT_STARTED | 51 | | DWH-024 | RAW | Mark the sensitive fields (PII/PHI) in `raw_sales_agent_data`| SENSITIVE_DATA_MARKING | NOT_STARTED | 52 | | DWH-025 | RAW | Mark the sensitive fields (PII/PHI) in `raw_ticket_sales_data`| SENSITIVE_DATA_MARKING | NOT_STARTED 
| 53 | | DWH-026 | RAW | Mark the sensitive fields (PII/PHI) in `raw_flight_bookings_data`| SENSITIVE_DATA_MARKING | NOT_STARTED | 54 | | DWH-027 | RAW | Mark the sensitive fields (PII/PHI) in `raw_flight_destination_revenue_data`| SENSITIVE_DATA_MARKING | NOT_STARTED | 55 | | DWH-028 | RAW | Add event logs to `raw_customer_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 56 | | DWH-029 | RAW | Add event logs to `raw_flight_schedules_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 57 | | DWH-030 | RAW | Add event logs to `raw_customer_feedback_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 58 | | DWH-031 | RAW | Add event logs to `raw_ticket_price_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 59 | | DWH-032 | RAW | Add event logs to `raw_customer_demographic_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 60 | | DWH-033 | RAW | Add event logs to `raw_flight_destination_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 61 | | DWH-034 | RAW | Add event logs to `raw_flight_ticket_sales_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 62 | | DWH-035 | RAW | Add event logs to `raw_flight_promotion_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 63 | | DWH-036 | RAW | Add event logs to `raw_airline_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 64 | | DWH-037 | RAW | Add event logs to `raw_sales_agent_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 65 | | DWH-038 | RAW | Add event logs to `raw_ticket_sales_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 66 | | DWH-039 | RAW | Add event logs to `raw_flight_bookings_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 67 | | DWH-040 | RAW | Add event logs to `raw_flight_destination_revenue_data`| LOGGING_RAW_LEVEL_EVENTS | NOT_STARTED | 68 | | DWH-041 | RAW | Design and run data profiling checks on `raw_customer_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 69 | | DWH-042 | RAW | Design and run data profiling checks on `raw_flight_schedules_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 70 | | DWH-043 | RAW | Design and run data profiling checks on `raw_customer_feedback_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 71 | | DWH-044 | RAW | Design and run data profiling checks on `raw_ticket_price_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 72 | | DWH-045 | RAW | Design and run data profiling checks on `raw_customer_demographic_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 73 | | DWH-046 | RAW | Design and run data profiling checks on `raw_flight_destination_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 74 | | DWH-047 | RAW | Design and run data profiling checks on `raw_flight_ticket_sales_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 75 | | DWH-048 | RAW | Design and run data profiling checks on `raw_flight_promotion_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 76 | | DWH-049 | RAW | Design and run data profiling checks on `raw_airline_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 77 | | DWH-050 | RAW | Design and run data profiling checks on `raw_sales_agent_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 78 | | DWH-051 | RAW | Design and run data profiling checks on `raw_ticket_sales_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 79 | | DWH-052 | RAW | Design and run data profiling checks on `raw_flight_bookings_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 80 | | DWH-053 | RAW | Design and run data profiling checks on `raw_flight_destination_revenue_data`| DATA_PROFILING_CHECKS | NOT_STARTED | 81 | 82 | 83 | 84 | | DWH-054 | RAW | Load source data for “Customer preferences data” into raw table `raw_customer_preferences_data`| LOAD_TO_RAW | NOT_STARTED | 85 | 86 | 
87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | ### Extracting source data to raw tables 95 | 96 | - Load source data for “Customer information” into raw table `raw_customer_data` 97 | - Load source data for “Flight schedules” into raw table `raw_flight_schedules_data` 98 | - Load source data for “Customer feedback” into raw table `raw_customer_feedback_data` 99 | - Load source data for “Ticket price data” into raw table `raw_ticket_price_data` 100 | - Load source data for “Raw customer demographic data” into raw table `raw_customer_demographic_data` 101 | - Load source data for “Flight destination information” into raw table `raw_flight_destination_data` 102 | - Load source data for “Flight ticket sales” into raw table `raw_flight_ticket_sales_data` 103 | - Load source data for “Flight Promotion” into raw table `raw_flight_promotion_data` 104 | - Load source data for “Holiday data” into raw table `raw_holiday_data` 105 | - Load source data for “Airline data” into raw table `raw_airline_data` 106 | - Load source data for “Sales agent data” into raw table `raw_sales_agent_data` 107 | - Load source data for “Ticket sales data” into raw table `raw_ticket_sales_data` 108 | - Load source data for “Flight bookings data” into raw table `raw_flight_bookings_data` 109 | - Load source data for “Fight destination revenue” into raw table `raw_flight_destination_revenue_data` 110 | 111 | ### Highlighting sensitive fields 112 | 113 | - Mark the sensitive fields (PII/PHI) in `raw_customer_data` 114 | - Mark the sensitive fields (PII/PHI) in `raw_flight_schedules_data` 115 | - Mark the sensitive fields (PII/PHI) in `raw_customer_feedback_data` 116 | - Mark the sensitive fields (PII/PHI) in `raw_ticket_price_data` 117 | - Mark the sensitive fields (PII/PHI) in `raw_customer_demographic_data` 118 | - Mark the sensitive fields (PII/PHI) in `raw_flight_destination_data` 119 | - Mark the sensitive fields (PII/PHI) in `raw_flight_ticket_sales_data` 120 | - Mark the sensitive fields (PII/PHI) in `raw_flight_promotion_data` 121 | - Mark the sensitive fields (PII/PHI) in `raw_airline_data` 122 | - Mark the sensitive fields (PII/PHI) in `raw_sales_agent_data` 123 | - Mark the sensitive fields (PII/PHI) in `raw_ticket_sales_data` 124 | - Mark the sensitive fields (PII/PHI) in `raw_flight_bookings_data` 125 | - Mark the sensitive fields (PII/PHI) in `raw_flight_destination_revenue_data` 126 | 127 | 128 | ### Adding event logs 129 | 130 | - Add event logs to `raw_customer_data` 131 | - Add event logs to `raw_flight_schedules_data` 132 | - Add event logs to `raw_customer_feedback_data` 133 | - Add event logs to `raw_ticket_price_data` 134 | - Add event logs to `raw_customer_demographic_data` 135 | - Add event logs to `raw_flight_destination_data` 136 | - Add event logs to `raw_flight_ticket_sales_data` 137 | - Add event logs to `raw_flight_promotion_data` 138 | - Add event logs to `raw_airline_data` 139 | - Add event logs to `raw_sales_agent_data` 140 | - Add event logs to `raw_ticket_sales_data` 141 | - Add event logs to `raw_flight_bookings_data` 142 | - Add event logs to `raw_flight_destination_revenue_data` 143 | 144 | 145 | ### Run data profiling checks 146 | 147 | - Design and run data profiling checks on `raw_customer_data` 148 | - Design and run data profiling checks on `raw_flight_schedules_data` 149 | - Design and run data profiling checks on `raw_customer_feedback_data` 150 | - Design and run data profiling checks on `raw_ticket_price_data` 151 | - Design and run data profiling checks on 
`raw_customer_demographic_data` 152 | - Design and run data profiling checks on `raw_flight_destination_data` 153 | - Design and run data profiling checks on `raw_flight_ticket_sales_data` 154 | - Design and run data profiling checks on `raw_flight_promotion_data` 155 | - Design and run data profiling checks on `raw_airline_data` 156 | - Design and run data profiling checks on `raw_sales_agent_data` 157 | - Design and run data profiling checks on `raw_ticket_sales_data` 158 | - Design and run data profiling checks on `raw_flight_bookings_data` 159 | - Design and run data profiling checks on `raw_flight_destination_revenue_data` 160 | 161 | 162 | 163 | 164 | 165 | ## Additional notes 166 | 167 | 168 | ... -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/__pycache__/L1_email_bot.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L1_raw_layer/__pycache__/L1_email_bot.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/__pycache__/raw_accommodation_bookings_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L1_raw_layer/__pycache__/raw_accommodation_bookings_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/__pycache__/raw_customer_feedbacks_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L1_raw_layer/__pycache__/raw_customer_feedbacks_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/__pycache__/raw_customer_info_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L1_raw_layer/__pycache__/raw_customer_info_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/__pycache__/raw_flight_bookings_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L1_raw_layer/__pycache__/raw_flight_bookings_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/__pycache__/raw_flight_destinations_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L1_raw_layer/__pycache__/raw_flight_destinations_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/__pycache__/raw_flight_promotion_deals_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L1_raw_layer/__pycache__/raw_flight_promotion_deals_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/__pycache__/raw_flight_schedules_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L1_raw_layer/__pycache__/raw_flight_schedules_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/__pycache__/raw_flight_ticket_sales_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L1_raw_layer/__pycache__/raw_flight_ticket_sales_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/__pycache__/raw_sales_agents_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L1_raw_layer/__pycache__/raw_sales_agents_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/__pycache__/raw_ticket_prices_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L1_raw_layer/__pycache__/raw_ticket_prices_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L1_raw_layer/raw-to-staging-diagram_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L1_raw_layer/raw-to-staging-diagram_2.png -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/L2_email_bot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import smtplib 3 | from datetime import datetime 4 | from email.mime.multipart import MIMEMultipart 5 | from email.mime.base import MIMEBase 6 | from email.mime.text import MIMEText 7 | from email.utils import COMMASPACE 8 | from email import encoders 9 | from pathlib import Path 10 | from dotenv import load_dotenv 11 | 12 | 13 | # Load environment variables from .env 14 | load_dotenv() 15 | 16 | 17 | 18 | # Set up constants 19 | current_filepath = Path(__file__).stem 20 | 21 | SMTP_PORT = 587 22 | SMTP_HOST_SERVER = "smtp.gmail.com" 23 | CURRENT_TIMESTAMP = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 24 | EMAIL_ADDRESS = os.getenv("SENDER") 25 | EMAIL_PASSWORD = os.getenv("EMAIL_PASSWORD") 26 | SENDER = "Postgres Data Warehouse Program - SDW" 27 | RECIPIENT = os.getenv("RECIPIENT") 28 | 29 | 30 | L0_LOG_DIRECTORY = os.getenv("L0_LOG_DIRECTORY") 31 | L1_LOG_DIRECTORY = os.getenv("L1_LOG_DIRECTORY") 32 | L2_LOG_DIRECTORY = os.getenv("L2_LOG_DIRECTORY") 33 | L3_LOG_DIRECTORY = os.getenv("L3_LOG_DIRECTORY") 34 | L4_LOG_DIRECTORY = os.getenv("L4_LOG_DIRECTORY") 35 | 
36 | body_main_subject = "loading data from raw tables into the staging tables of the Postgres data warehouse" 37 | body = f"""Hi Stephen, 38 | 39 | See attached the logs for {body_main_subject}. 40 | 41 | Regards, 42 | {SENDER} 43 | 44 | """ 45 | 46 | 47 | # Create function for getting the directory paths for log files 48 | def get_log_filepaths(log_directory): 49 | log_filepaths = [] 50 | for root, directories, log_files in os.walk(log_directory): 51 | for filename in log_files: 52 | log_filepath = os.path.join(root, filename) 53 | log_filepaths.append(log_filepath) 54 | return log_filepaths 55 | 56 | 57 | # Create function for attaching log files to email 58 | def attach_log_files_to_email(message, log_filepaths): 59 | for log_file in log_filepaths: 60 | with open(log_file, 'rb') as file: 61 | log_attachment = MIMEBase('application', 'octet-stream') 62 | log_attachment.set_payload(file.read()) 63 | encoders.encode_base64(log_attachment) 64 | log_attachment.add_header('Content-Disposition', f'attachment; filename="{os.path.basename(log_file)}"') 65 | message.attach(log_attachment) 66 | 67 | 68 | 69 | 70 | # ===================================== SETTING UP LOG FILE ATTACHMENTS ===================================== 71 | 72 | 73 | # Get directory paths for log files 74 | staging_layer_log_directory = get_log_filepaths(L2_LOG_DIRECTORY) 75 | 76 | log_file_counter = 0 77 | for log_file in staging_layer_log_directory: 78 | log_file_counter += 1 79 | print('') 80 | print(f'Log file {log_file_counter}: {log_file} ') 81 | 82 | 83 | 84 | # ===================================== SETTING UP EMAIL MESSAGE ===================================== 85 | 86 | # Set up constants for email 87 | message = MIMEMultipart() 88 | message["From"] = SENDER 89 | message["To"] = RECIPIENT 90 | message["Subject"] = f"L2 - Staging Layer Log Files - {CURRENT_TIMESTAMP}" 91 | 92 | 93 | # Add body to the email message 94 | message.attach(MIMEText(body, "plain")) 95 | 96 | 97 | # Attach log files to email 98 | attach_log_files_to_email(message, staging_layer_log_directory) 99 | 100 | 101 | 102 | # ===================================== SENDING EMAIL MESSAGE ===================================== 103 | 104 | def send_email(): 105 | with smtplib.SMTP(host=SMTP_HOST_SERVER, port=SMTP_PORT) as smtp: 106 | smtp.ehlo() 107 | smtp.starttls() 108 | smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD) 109 | smtp.send_message(message) 110 | print('Message sent successfully. 
') 111 | print() 112 | 113 | 114 | send_email() 115 | 116 | 117 | -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/__pycache__/L2_email_bot.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/__pycache__/L2_email_bot.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/__pycache__/stg_accommodation_bookings_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/__pycache__/stg_accommodation_bookings_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/__pycache__/stg_customer_feedbacks_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/__pycache__/stg_customer_feedbacks_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/__pycache__/stg_customer_info_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/__pycache__/stg_customer_info_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/__pycache__/stg_flight_bookings_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/__pycache__/stg_flight_bookings_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/__pycache__/stg_flight_destinations_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/__pycache__/stg_flight_destinations_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/__pycache__/stg_flight_promotion_deals_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/__pycache__/stg_flight_promotion_deals_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/__pycache__/stg_flight_schedules_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/__pycache__/stg_flight_schedules_tbl.cpython-310.pyc 
-------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/__pycache__/stg_flight_ticket_sales_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/__pycache__/stg_flight_ticket_sales_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/__pycache__/stg_sales_agents_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/__pycache__/stg_sales_agents_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/__pycache__/stg_ticket_prices_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/__pycache__/stg_ticket_prices_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_accommodation_bookings_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_accommodation_bookings_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_customer_feedbacks_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_customer_feedbacks_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_customer_info_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_customer_info_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_flight_bookings_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_flight_bookings_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_flight_destinations_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_flight_destinations_tbl.cpython-310.pyc 
-------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_flight_promotion_deals_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_flight_promotion_deals_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_flight_schedules_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_flight_schedules_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_flight_ticket_sales_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_flight_ticket_sales_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_sales_agents_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_sales_agents_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_ticket_prices_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/dev/__pycache__/stg_ticket_prices_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/prod/__pycache__/create_prod_env.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/prod/__pycache__/create_prod_env.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/prod/__pycache__/create_stg_prod_env.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/prod/__pycache__/create_stg_prod_env.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/prod/create_stg_prod_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | import random 5 | import psycopg2 6 | import pandas as pd 7 | import configparser 8 | from pathlib import Path 9 | import logging, coloredlogs 10 | from datetime import datetime 11 | 12 | # 
================================================ LOGGER ================================================ 13 | 14 | 15 | # Set up root root_logger 16 | root_logger = logging.getLogger(__name__) 17 | root_logger.setLevel(logging.DEBUG) 18 | 19 | 20 | # Set up formatter for logs 21 | file_handler_log_formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s ') 22 | console_handler_log_formatter = coloredlogs.ColoredFormatter(fmt = '%(message)s', level_styles=dict( 23 | debug = dict (color = 'white'), 24 | info = dict (color = 'green'), 25 | warning = dict (color = 'cyan'), 26 | error = dict (color = 'red', bold = True, bright = True), 27 | critical = dict (color = 'black', bold = True, background = 'red') 28 | ), 29 | 30 | field_styles=dict( 31 | messages = dict (color = 'white') 32 | ) 33 | ) 34 | 35 | 36 | # Set up file handler object for logging events to file 37 | current_filepath = Path(__file__).stem 38 | file_handler = logging.FileHandler('logs/L2_staging_layer/prod/' + current_filepath + '.log', mode='w') 39 | file_handler.setFormatter(file_handler_log_formatter) 40 | 41 | 42 | # Set up console handler object for writing event logs to console in real time (i.e. streams events to stderr) 43 | console_handler = logging.StreamHandler() 44 | console_handler.setFormatter(console_handler_log_formatter) 45 | 46 | 47 | # Add the file handler 48 | root_logger.addHandler(file_handler) 49 | 50 | 51 | # Only add the console handler if the script is running directly from this location 52 | if __name__=="__main__": 53 | root_logger.addHandler(console_handler) 54 | 55 | 56 | 57 | 58 | # ================================================ CONFIG ================================================ 59 | config = configparser.ConfigParser() 60 | 61 | # Use the local config file from the local machine 62 | path = os.path.abspath('dwh_pipelines/local_config.ini') 63 | config.read(path) 64 | DATASETS_LOCATION_PATH = config['travel_data_filepath']['DATASETS_LOCATION_PATH'] 65 | 66 | host = config['travel_data_filepath']['HOST'] 67 | port = config['travel_data_filepath']['PORT'] 68 | database = config['travel_data_filepath']['STAGING_DB'] 69 | username = config['travel_data_filepath']['USERNAME'] 70 | password = config['travel_data_filepath']['PASSWORD'] 71 | 72 | postgres_connection = None 73 | cursor = None 74 | 75 | 76 | 77 | # Connect to the staging instance of the Postgres data warehouse 78 | 79 | postgres_connection = psycopg2.connect( 80 | host = host, 81 | port = port, 82 | dbname = database, 83 | user = username, 84 | password = password, 85 | ) 86 | postgres_connection.set_session(autocommit=True) 87 | 88 | 89 | 90 | def create_prod_environment_for_staging(): 91 | # Set up constants 92 | CURRENT_TIMESTAMP = datetime.now() 93 | dev_schema_name = 'dev' 94 | prod_schema_name = 'prod' 95 | active_db_name = database 96 | data_warehouse_layer = 'STAGING' 97 | 98 | 99 | # Create a cursor object to execute the PG-SQL commands 100 | cursor = postgres_connection.cursor() 101 | 102 | 103 | 104 | # Validate the Postgres database connection 105 | if postgres_connection.closed == 0: 106 | root_logger.debug(f"") 107 | root_logger.info("=================================================================================") 108 | root_logger.info(f"CONNECTION SUCCESS: Managed to connect successfully to the {active_db_name} database!!") 109 | root_logger.info(f"Connection details: {postgres_connection.dsn} ") 110 | 
root_logger.info("=================================================================================") 111 | root_logger.debug("") 112 | 113 | elif postgres_connection.closed != 0: 114 | raise ConnectionError("CONNECTION ERROR: Unable to connect to the demo_company database...") 115 | 116 | 117 | 118 | # Set up SQL statements for schema creation and validation check 119 | try: 120 | 121 | create_schema = f''' CREATE SCHEMA IF NOT EXISTS {prod_schema_name}; 122 | ''' 123 | 124 | check_if_schema_exists = f''' SELECT schema_name from information_schema.schemata WHERE schema_name= '{prod_schema_name}'; 125 | ''' 126 | 127 | 128 | # Create schema in Postgres 129 | CREATING_SCHEMA_PROCESSING_START_TIME = time.time() 130 | cursor.execute(create_schema) 131 | root_logger.info("") 132 | root_logger.info(f"Successfully created '{prod_schema_name}' schema. ") 133 | root_logger.info("") 134 | CREATING_SCHEMA_PROCESSING_END_TIME = time.time() 135 | 136 | 137 | CREATING_SCHEMA_VAL_CHECK_START_TIME = time.time() 138 | cursor.execute(check_if_schema_exists) 139 | CREATING_SCHEMA_VAL_CHECK_END_TIME = time.time() 140 | 141 | 142 | 143 | sql_result = cursor.fetchone()[0] 144 | if sql_result: 145 | root_logger.debug(f"") 146 | root_logger.info(f"=================================================================================================") 147 | root_logger.info(f"SCHEMA CREATION SUCCESS: Managed to create {prod_schema_name} schema in {active_db_name} ") 148 | root_logger.info(f"Schema name in Postgres: {sql_result} ") 149 | root_logger.info(f"SQL Query for validation check: {check_if_schema_exists} ") 150 | root_logger.info(f"=================================================================================================") 151 | root_logger.debug(f"") 152 | 153 | else: 154 | root_logger.debug(f"") 155 | root_logger.error(f"=================================================================================================") 156 | root_logger.error(f"SCHEMA CREATION FAILURE: Unable to create schema for {active_db_name}...") 157 | root_logger.info(f"SQL Query for validation check: {check_if_schema_exists} ") 158 | root_logger.error(f"=================================================================================================") 159 | root_logger.debug(f"") 160 | 161 | # postgres_connection.commit() 162 | 163 | except Exception as e: 164 | print(e) 165 | 166 | 167 | # Get all the tables from DEV environment 168 | try: 169 | root_logger.debug(f"") 170 | root_logger.debug(f"Now creating '{prod_schema_name}' environment ....") 171 | root_logger.debug(f"") 172 | sql_query = f""" SELECT table_name FROM information_schema.tables WHERE table_schema = '{dev_schema_name}' AND table_name LIKE '%stg%' 173 | """ 174 | cursor.execute(sql_query) 175 | 176 | sql_results = cursor.fetchall() 177 | no_of_sql_results = len(sql_results) 178 | root_logger.debug(f'No of results: {no_of_sql_results} ') 179 | 180 | 181 | for table in sql_results: 182 | table_name = table[0] 183 | root_logger.info(f"") 184 | root_logger.info(f"Now creating '{table_name}' table in production environment ...") 185 | # root_logger.info(f"") 186 | sql_query = f""" CREATE TABLE IF NOT EXISTS {prod_schema_name}.{table_name} as SELECT * FROM {dev_schema_name}.{table_name} 187 | """ 188 | cursor.execute(sql_query) 189 | # root_logger.info(f"") 190 | root_logger.info(f"Successfully created '{table_name}' table in production environment ") 191 | root_logger.info(f"") 192 | 193 | 194 | # postgres_connection.commit() 195 | root_logger.debug(f"") 196 | 
root_logger.debug(f"Successfully created '{prod_schema_name}' environment. ") 197 | root_logger.debug(f"") 198 | 199 | 200 | 201 | except Exception as e: 202 | print(e) 203 | 204 | 205 | finally: 206 | 207 | # Close the cursor if it exists 208 | if cursor is not None: 209 | cursor.close() 210 | root_logger.debug("") 211 | root_logger.debug("Cursor closed successfully.") 212 | 213 | # Close the database connection to Postgres if it exists 214 | if postgres_connection is not None: 215 | postgres_connection.close() 216 | # root_logger.debug("") 217 | root_logger.debug("Session connected to Postgres database closed.") 218 | 219 | 220 | 221 | create_prod_environment_for_staging() -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/staging-to-semantic-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/staging-to-semantic-diagram.png -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_accommodation_bookings_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_accommodation_bookings_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_customer_feedbacks_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_customer_feedbacks_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_customer_info_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_customer_info_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_flight_bookings_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_flight_bookings_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_flight_destinations_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_flight_destinations_tbl.cpython-310-pytest-7.2.1.pyc 
-------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_flight_promotion_deals_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_flight_promotion_deals_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_flight_schedules_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_flight_schedules_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_flight_ticket_sales_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_flight_ticket_sales_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_sales_agents_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_sales_agents_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_ticket_prices_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L2_staging_layer/tests/__pycache__/test_stg_ticket_prices_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/test_stg_customer_feedbacks_tbl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import pytest 5 | import psycopg2 6 | import configparser 7 | from datetime import datetime 8 | 9 | 10 | 11 | 12 | # ================================================ CONFIG ================================================ 13 | 14 | # Add a flag/switch indicating whether Airflow is in use or not 15 | USING_AIRFLOW = False 16 | 17 | 18 | 19 | # Create a config file for storing environment variables 20 | config = configparser.ConfigParser() 21 | if USING_AIRFLOW: 22 | 23 | # Use the airflow config file from the airflow container 24 | config.read('/usr/local/airflow/dags/etl_to_postgres/airflow_config.ini') 25 | DATASETS_LOCATION_PATH = config['postgres_airflow_config']['DATASET_SOURCE_PATH'] 26 | 27 | host = config['postgres_airflow_config']['HOST'] 28 | port = config['postgres_airflow_config']['PORT'] 29 | database = 
config['postgres_airflow_config']['STAGING_DB'] 30 | username = config['postgres_airflow_config']['USERNAME'] 31 | password = config['postgres_airflow_config']['PASSWORD'] 32 | 33 | postgres_connection = None 34 | cursor = None 35 | 36 | 37 | else: 38 | 39 | # Use the local config file from the local machine 40 | path = os.path.abspath('dwh_pipelines/local_config.ini') 41 | config.read(path) 42 | DATASETS_LOCATION_PATH = config['travel_data_filepath']['DATASETS_LOCATION_PATH'] 43 | 44 | host = config['travel_data_filepath']['HOST'] 45 | port = config['travel_data_filepath']['PORT'] 46 | database = config['travel_data_filepath']['STAGING_DB'] 47 | username = config['travel_data_filepath']['USERNAME'] 48 | password = config['travel_data_filepath']['PASSWORD'] 49 | 50 | postgres_connection = None 51 | cursor = None 52 | 53 | 54 | 55 | # Connect to the Postgres database 56 | try: 57 | pgsql_connection = psycopg2.connect( 58 | host = host, 59 | port = port, 60 | dbname = database, 61 | user = username, 62 | password = password, 63 | ) 64 | 65 | 66 | # Create a cursor object to execute the PG-SQL commands 67 | cursor = pgsql_connection.cursor() 68 | 69 | 70 | except psycopg2.Error: 71 | raise ConnectionError("CONNECTION ERROR: Unable to connect to the demo_company database...") 72 | 73 | 74 | 75 | # Define the database, schema and table names 76 | 77 | table_name = 'stg_customer_feedbacks_tbl' 78 | schema_name = 'dev' 79 | database_name = database 80 | 81 | 82 | 83 | # ====================================== TEST 1: DATABASE CONNECTION CHECK ====================================== 84 | 85 | 86 | """ Test the connection to the Postgres database is successful or not """ 87 | 88 | def test_database_connection(): 89 | 90 | # Assert the existence of a valid connection to the database (i.e. not None) 91 | assert pgsql_connection is not None, f"CONNECTION ERROR: Unable to connect to the {database_name} database... " 92 | 93 | 94 | 95 | 96 | 97 | 98 | # ====================================== TEST 2: SCHEMA EXISTENCE CHECK ====================================== 99 | 100 | 101 | """ Verify the staging schema exists in the Postgres staging database """ 102 | 103 | 104 | 105 | def test_schema_existence(): 106 | sql_query = f""" SELECT schema_name FROM information_schema.schemata 107 | """ 108 | cursor.execute(sql_query) 109 | 110 | sql_results = cursor.fetchall() 111 | schemas = [schema[0] for schema in sql_results] 112 | 113 | assert schema_name in schemas, f"The '{schema_name}' schema should be found in the '{database_name}' database. 
" 114 | 115 | 116 | 117 | 118 | 119 | 120 | # ====================================== TEST 3: COLUMNS EXISTENCE CHECK ====================================== 121 | 122 | """ Verify the columns of this table exists in the Postgres staging database """ 123 | 124 | 125 | 126 | def test_columns_existence(): 127 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name}' 128 | """ 129 | cursor.execute(sql_query) 130 | 131 | sql_results = cursor.fetchall() 132 | actual_columns = [column[0] for column in sql_results] 133 | 134 | expected_columns = ['feedback_id', 135 | 'customer_id', 136 | 'flight_booking_id', 137 | 'feedback_date', 138 | 'feedback_text', 139 | 'created_at', 140 | 'updated_at', 141 | 'source_system', 142 | 'source_file', 143 | 'load_timestamp', 144 | 'dwh_layer' 145 | ] 146 | 147 | for expected_column in expected_columns: 148 | assert expected_column in actual_columns, f"The '{expected_column}' column should be in the '{table_name}' table. " 149 | 150 | 151 | 152 | 153 | 154 | # ====================================== TEST 4: TABLE EXISTENCE CHECK ====================================== 155 | 156 | 157 | """ Check if the active table is in the Postgres staging database """ 158 | 159 | 160 | def test_table_existence(): 161 | sql_query = f""" SELECT * FROM information_schema.tables WHERE table_name = '{table_name}' AND table_schema = '{schema_name}' ; """ 162 | cursor.execute(sql_query) 163 | sql_result = cursor.fetchone() 164 | 165 | assert sql_result is not None, f"The '{table_name}' does not exist in the '{database}.{schema_name}' schema. " 166 | 167 | 168 | 169 | 170 | 171 | # ====================================== TEST 5: DATA TYPES CHECK ====================================== 172 | 173 | 174 | """ Test if each column is mapped to the expected data type in Postgres """ 175 | 176 | 177 | def test_column_data_types(): 178 | 179 | # Create a dictionary that specifies the expected data types for each column 180 | expected_data_types = { 181 | 'feedback_id' : 'uuid', 182 | 'customer_id' : 'uuid', 183 | 'flight_booking_id' : 'uuid', 184 | 'feedback_date' : 'date', 185 | 'feedback_text' : 'text', 186 | "created_at" : "timestamp with time zone", 187 | "updated_at" : "timestamp with time zone", 188 | "source_system" : "character varying", 189 | "source_file" : "character varying", 190 | "load_timestamp" : "timestamp without time zone", 191 | "dwh_layer" : "character varying" 192 | 193 | } 194 | 195 | 196 | 197 | # Use SQL to extract the column names and their data types 198 | sql_query = f""" SELECT column_name, data_type from information_schema.columns WHERE table_name = '{table_name}' 199 | """ 200 | cursor.execute(sql_query) 201 | 202 | sql_results = cursor.fetchall() 203 | 204 | for column_name, actual_data_type in sql_results: 205 | assert actual_data_type.lower() == expected_data_types[column_name], f"The expected data type for column '{column_name}' was '{expected_data_types[column_name]}', but the actual data type was '{actual_data_type}'. 
" 206 | 207 | 208 | 209 | 210 | 211 | # ====================================== TEST 6: EMPTY VALUES CHECK ====================================== 212 | 213 | 214 | """ Check if there are any empty values present in your table """ 215 | 216 | def test_empty_values_in_table(): 217 | sql_query = f""" SELECT * FROM {schema_name}.{table_name} 218 | """ 219 | cursor.execute(sql_query) 220 | sql_results = cursor.fetchall() 221 | 222 | row_no = 0 223 | for record in sql_results: 224 | row_no +=1 225 | for cell_value in record: 226 | assert cell_value is not None, f" There is an empty value in the '{schema_name}.{table_name}' table on row '{row_no}' . " 227 | 228 | 229 | 230 | 231 | 232 | 233 | # ====================================== TEST 7: NULL VALUES CHECK ====================================== 234 | 235 | """ Check if there are any NULL values present in your table """ 236 | 237 | def test_null_values_in_table(): 238 | 239 | # Get list of columns from table 240 | cursor.execute(f""" SELECT column_name from information_schema.columns WHERE table_name = '{table_name}' ; 241 | """) 242 | columns = cursor.fetchall() 243 | 244 | 245 | for column in columns: 246 | sql_query = f'SELECT COUNT(*) FROM {schema_name}.{table_name} WHERE {column[0]} is NULL' 247 | cursor.execute(sql_query) 248 | sql_result = cursor.fetchone() 249 | 250 | assert sql_result[0] == 0, f"The {column} column has NULL values. " 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | # ====================================== TEST 8: DATE FORMATTING CHECK ====================================== 259 | 260 | 261 | """ Check the date columns contain values in the 'yyyy-mm-dd' format """ 262 | 263 | def test_date_formatting_constraint(): 264 | expected_date_format = r"^\d{4}-\d{2}-\d{2}$" 265 | data_type = 'date' 266 | 267 | sql_query_1 = f''' SELECT column_name FROM information_schema.columns WHERE table_name = '{table_name}' AND data_type = '{data_type}' ''' 268 | cursor.execute(sql_query_1) 269 | 270 | sql_results_1 = cursor.fetchall() 271 | date_columns = [sql_result[0] for sql_result in sql_results_1] 272 | 273 | for date_column in date_columns: 274 | sql_query_2 = f""" SELECT {date_column} 275 | FROM {schema_name}.{table_name} 276 | """ 277 | cursor.execute(sql_query_2) 278 | sql_results_2 = cursor.fetchall() 279 | for sql_result in sql_results_2: 280 | date_value = sql_result[0].strftime("%Y-%m-%d") 281 | assert re.match(expected_date_format, date_value) is not None, f"Invalid date detected - date values should be in 'yyyy-mm-dd' format." 282 | 283 | 284 | 285 | 286 | # ====================================== TEST 9: ID CHARACTER LENGTH CONSTRAINT CHECK ====================================== 287 | 288 | """ Test all the ID columns in the table contain 36 characters in length """ 289 | 290 | def test_id_char_length_constraint(): 291 | expected_id_char_length = 36 292 | sql_results = cursor.fetchall() 293 | 294 | 295 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name} AND column_name LIKE "%_id%" ' 296 | """ 297 | cursor.execute(sql_query) 298 | 299 | sql_results = cursor.fetchall() 300 | 301 | 302 | # Assert the number of characters for the id column is equal to 36 303 | for sql_result in sql_results: 304 | id_column = sql_result[0] 305 | actual_id_length = len(id_column) 306 | assert actual_id_length == expected_id_char_length, f"Invalid ID column found: All ID columns must be {expected_id_char_length} characters long. 
The ID column containing invalid IDs is '{id_column}' column" 307 | 308 | 309 | 310 | 311 | # ====================================== TEST 10: DUPLICATES CHECK ====================================== 312 | 313 | 314 | """ Test the number of duplicate records appearing in the Postgres table """ 315 | 316 | def test_duplicate_records_count(): 317 | column_name = "feedback_id" 318 | sql_query = f""" SELECT {column_name}, 319 | COUNT (*) 320 | FROM {schema_name}.{table_name} 321 | GROUP BY {column_name} 322 | HAVING COUNT(*) > 1 323 | ; 324 | """ 325 | cursor.execute(sql_query) 326 | 327 | duplicates = cursor.fetchall() 328 | total_no_of_duplicates = len(duplicates) 329 | 330 | # Assert the number of uniqueness constraints for the table specified is at least 1 331 | assert total_no_of_duplicates == 0, f"Duplicate entries detected - {table_name} should contain no duplicate entries." 332 | 333 | def run_tests(): 334 | test_filepath = os.path.abspath('dwh_pipelines/L2_staging_layer/tests/test_stg_customer_feedbacks_tbl.py') 335 | test_result = pytest.main([test_filepath]) 336 | return test_result 337 | 338 | 339 | 340 | if __name__ == "__main__": 341 | # Run DQ tests 342 | test_result = run_tests() 343 | 344 | # Create DQ reports in HTML format 345 | from pathlib import Path 346 | import webbrowser 347 | file_path = os.path.abspath(__file__) 348 | current_filepath = Path(__file__).stem 349 | html_report_path = f"{current_filepath}.html" 350 | pytest.main(["-v", "-s", "--capture=tee-sys", file_path, f"--html={html_report_path}", "--self-contained-html"]) 351 | 352 | # Open DQ reports in browser 353 | # dq_report_url = Path.cwd() / html_report_path 354 | # webbrowser.open(dq_report_url.as_uri()) 355 | sys.exit() 356 | -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/test_stg_flight_destinations_tbl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import pytest 5 | import psycopg2 6 | import configparser 7 | from datetime import datetime 8 | 9 | 10 | 11 | 12 | # ================================================ CONFIG ================================================ 13 | 14 | # Add a flag/switch indicating whether Airflow is in use or not 15 | USING_AIRFLOW = False 16 | 17 | 18 | 19 | # Create a config file for storing environment variables 20 | config = configparser.ConfigParser() 21 | if USING_AIRFLOW: 22 | 23 | # Use the airflow config file from the airflow container 24 | config.read('/usr/local/airflow/dags/etl_to_postgres/airflow_config.ini') 25 | DATASETS_LOCATION_PATH = config['postgres_airflow_config']['DATASET_SOURCE_PATH'] 26 | 27 | host = config['postgres_airflow_config']['HOST'] 28 | port = config['postgres_airflow_config']['PORT'] 29 | database = config['postgres_airflow_config']['STAGING_DB'] 30 | username = config['postgres_airflow_config']['USERNAME'] 31 | password = config['postgres_airflow_config']['PASSWORD'] 32 | 33 | postgres_connection = None 34 | cursor = None 35 | 36 | 37 | else: 38 | 39 | # Use the local config file from the local machine 40 | path = os.path.abspath('dwh_pipelines/local_config.ini') 41 | config.read(path) 42 | DATASETS_LOCATION_PATH = config['travel_data_filepath']['DATASETS_LOCATION_PATH'] 43 | 44 | host = config['travel_data_filepath']['HOST'] 45 | port = config['travel_data_filepath']['PORT'] 46 | database = config['travel_data_filepath']['STAGING_DB'] 47 | username = 
config['travel_data_filepath']['USERNAME'] 48 | password = config['travel_data_filepath']['PASSWORD'] 49 | 50 | postgres_connection = None 51 | cursor = None 52 | 53 | 54 | 55 | # Connect to the Postgres database 56 | try: 57 | pgsql_connection = psycopg2.connect( 58 | host = host, 59 | port = port, 60 | dbname = database, 61 | user = username, 62 | password = password, 63 | ) 64 | 65 | 66 | # Create a cursor object to execute the PG-SQL commands 67 | cursor = pgsql_connection.cursor() 68 | 69 | 70 | except psycopg2.Error: 71 | raise ConnectionError("CONNECTION ERROR: Unable to connect to the demo_company database...") 72 | 73 | 74 | 75 | # Define the database, schema and table names 76 | 77 | table_name = 'stg_flight_destinations_tbl' 78 | schema_name = 'dev' 79 | database_name = database 80 | 81 | 82 | 83 | # ====================================== TEST 1: DATABASE CONNECTION CHECK ====================================== 84 | 85 | 86 | """ Test the connection to the Postgres database is successful or not """ 87 | 88 | def test_database_connection(): 89 | 90 | # Assert the existence of a valid connection to the database (i.e. not None) 91 | assert pgsql_connection is not None, f"CONNECTION ERROR: Unable to connect to the {database_name} database... " 92 | 93 | 94 | 95 | 96 | 97 | 98 | # ====================================== TEST 2: SCHEMA EXISTENCE CHECK ====================================== 99 | 100 | 101 | """ Verify the staging schema exists in the Postgres staging database """ 102 | 103 | 104 | 105 | def test_schema_existence(): 106 | sql_query = f""" SELECT schema_name FROM information_schema.schemata 107 | """ 108 | cursor.execute(sql_query) 109 | 110 | sql_results = cursor.fetchall() 111 | schemas = [schema[0] for schema in sql_results] 112 | 113 | assert schema_name in schemas, f"The '{schema_name}' schema should be found in the '{database_name}' database. " 114 | 115 | 116 | 117 | 118 | 119 | 120 | # ====================================== TEST 3: COLUMNS EXISTENCE CHECK ====================================== 121 | 122 | """ Verify the columns of this table exists in the Postgres staging database """ 123 | 124 | 125 | 126 | def test_columns_existence(): 127 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name}' 128 | """ 129 | cursor.execute(sql_query) 130 | 131 | sql_results = cursor.fetchall() 132 | actual_columns = [column[0] for column in sql_results] 133 | 134 | expected_columns = ['flight_id', 135 | 'arrival_city', 136 | 'departure_city', 137 | 'created_at', 138 | 'updated_at', 139 | 'source_system', 140 | 'source_file', 141 | 'load_timestamp', 142 | 'dwh_layer' 143 | ] 144 | 145 | for expected_column in expected_columns: 146 | assert expected_column in actual_columns, f"The '{expected_column}' column should be in the '{table_name}' table. " 147 | 148 | 149 | 150 | 151 | 152 | # ====================================== TEST 4: TABLE EXISTENCE CHECK ====================================== 153 | 154 | 155 | """ Check if the active table is in the Postgres staging database """ 156 | 157 | 158 | def test_table_existence(): 159 | sql_query = f""" SELECT * FROM information_schema.tables WHERE table_name = '{table_name}' AND table_schema = '{schema_name}' ; """ 160 | cursor.execute(sql_query) 161 | sql_result = cursor.fetchone() 162 | 163 | assert sql_result is not None, f"The '{table_name}' does not exist in the '{database}.{schema_name}' schema. 
" 164 | 165 | 166 | 167 | 168 | 169 | # ====================================== TEST 5: DATA TYPES CHECK ====================================== 170 | 171 | 172 | """ Test if each column is mapped to the expected data type in Postgres """ 173 | 174 | 175 | def test_column_data_types(): 176 | 177 | # Create a dictionary that specifies the expected data types for each column 178 | expected_data_types = { 179 | 'flight_id' : "uuid", 180 | 'arrival_city' : "character varying", 181 | 'departure_city' : "character varying", 182 | "created_at" : "timestamp with time zone", 183 | "updated_at" : "timestamp with time zone", 184 | "source_system" : "character varying", 185 | "source_file" : "character varying", 186 | "load_timestamp" : "timestamp without time zone", 187 | "dwh_layer" : "character varying" 188 | 189 | } 190 | 191 | 192 | 193 | # Use SQL to extract the column names and their data types 194 | sql_query = f""" SELECT column_name, data_type from information_schema.columns WHERE table_name = '{table_name}' 195 | """ 196 | cursor.execute(sql_query) 197 | 198 | sql_results = cursor.fetchall() 199 | 200 | for column_name, actual_data_type in sql_results: 201 | assert actual_data_type.lower() == expected_data_types[column_name], f"The expected data type for column '{column_name}' was '{expected_data_types[column_name]}', but the actual data type was '{actual_data_type}'. " 202 | 203 | 204 | 205 | 206 | 207 | # ====================================== TEST 6: EMPTY VALUES CHECK ====================================== 208 | 209 | 210 | """ Check if there are any empty values present in your table """ 211 | 212 | def test_empty_values_in_table(): 213 | sql_query = f""" SELECT * FROM {schema_name}.{table_name} 214 | """ 215 | cursor.execute(sql_query) 216 | sql_results = cursor.fetchall() 217 | 218 | row_no = 0 219 | for record in sql_results: 220 | row_no +=1 221 | for cell_value in record: 222 | assert cell_value is not None, f" There is an empty value in the '{schema_name}.{table_name}' table on row '{row_no}' . " 223 | 224 | 225 | 226 | 227 | 228 | 229 | # ====================================== TEST 7: NULL VALUES CHECK ====================================== 230 | 231 | """ Check if there are any NULL values present in your table """ 232 | 233 | def test_null_values_in_table(): 234 | 235 | # Get list of columns from table 236 | cursor.execute(f""" SELECT column_name from information_schema.columns WHERE table_name = '{table_name}' ; 237 | """) 238 | columns = cursor.fetchall() 239 | 240 | 241 | for column in columns: 242 | sql_query = f'SELECT COUNT(*) FROM {schema_name}.{table_name} WHERE {column[0]} is NULL' 243 | cursor.execute(sql_query) 244 | sql_result = cursor.fetchone() 245 | 246 | assert sql_result[0] == 0, f"The {column} column has NULL values. 
" 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | # ====================================== TEST 8: DATE FORMATTING CHECK ====================================== 255 | 256 | 257 | """ Check the date columns contain values in the 'yyyy-mm-dd' format """ 258 | 259 | def test_date_formatting_constraint(): 260 | expected_date_format = r"^\d{4}-\d{2}-\d{2}$" 261 | data_type = 'date' 262 | 263 | sql_query_1 = f''' SELECT column_name FROM information_schema.columns WHERE table_name = '{table_name}' AND data_type = '{data_type}' ''' 264 | cursor.execute(sql_query_1) 265 | 266 | sql_results_1 = cursor.fetchall() 267 | date_columns = [sql_result[0] for sql_result in sql_results_1] 268 | 269 | for date_column in date_columns: 270 | sql_query_2 = f""" SELECT {date_column} 271 | FROM {schema_name}.{table_name} 272 | """ 273 | cursor.execute(sql_query_2) 274 | sql_results_2 = cursor.fetchall() 275 | for sql_result in sql_results_2: 276 | date_value = sql_result[0].strftime("%Y-%m-%d") 277 | assert re.match(expected_date_format, date_value) is not None, f"Invalid date detected - date values should be in 'yyyy-mm-dd' format." 278 | 279 | 280 | 281 | 282 | # ====================================== TEST 9: ID CHARACTER LENGTH CONSTRAINT CHECK ====================================== 283 | 284 | """ Test all the ID columns in the table contain 36 characters in length """ 285 | 286 | def test_id_char_length_constraint(): 287 | expected_id_char_length = 36 288 | sql_results = cursor.fetchall() 289 | 290 | 291 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name} AND column_name LIKE "%_id%" ' 292 | """ 293 | cursor.execute(sql_query) 294 | 295 | sql_results = cursor.fetchall() 296 | 297 | 298 | # Assert the number of characters for the id column is equal to 36 299 | for sql_result in sql_results: 300 | id_column = sql_result[0] 301 | actual_id_length = len(id_column) 302 | assert actual_id_length == expected_id_char_length, f"Invalid ID column found: All ID columns must be {expected_id_char_length} characters long. The ID column containing invalid IDs is '{id_column}' column" 303 | 304 | 305 | 306 | 307 | 308 | # ====================================== TEST 10: DUPLICATES CHECK ====================================== 309 | 310 | 311 | """ Test the number of duplicate records appearing in the Postgres table """ 312 | 313 | def test_duplicate_records_count(): 314 | column_name = "flight_id" 315 | sql_query = f""" SELECT {column_name}, 316 | COUNT (*) 317 | FROM {schema_name}.{table_name} 318 | GROUP BY {column_name} 319 | HAVING COUNT(*) > 1 320 | ; 321 | """ 322 | cursor.execute(sql_query) 323 | 324 | duplicates = cursor.fetchall() 325 | total_no_of_duplicates = len(duplicates) 326 | 327 | # Assert the number of uniqueness constraints for the table specified is at least 1 328 | assert total_no_of_duplicates == 0, f"Duplicate entries detected - {table_name} should contain no duplicate entries." 
329 | 330 | 331 | 332 | def run_tests(): 333 | test_filepath = os.path.abspath('dwh_pipelines/L2_staging_layer/tests/test_stg_flight_destinations_tbl.py') 334 | test_result = pytest.main([test_filepath]) 335 | return test_result 336 | 337 | 338 | 339 | if __name__ == "__main__": 340 | # Run DQ tests 341 | test_result = run_tests() 342 | 343 | # Create DQ reports in HTML format 344 | from pathlib import Path 345 | import webbrowser 346 | file_path = os.path.abspath(__file__) 347 | current_filepath = Path(__file__).stem 348 | html_report_path = f"{current_filepath}.html" 349 | pytest.main(["-v", "-s", "--capture=tee-sys", file_path, f"--html={html_report_path}", "--self-contained-html"]) 350 | 351 | # Open DQ reports in browser 352 | # dq_report_url = Path.cwd() / html_report_path 353 | # webbrowser.open(dq_report_url.as_uri()) 354 | sys.exit() 355 | -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/test_stg_flight_promotion_deals_tbl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pytest 4 | import psycopg2 5 | import configparser 6 | 7 | 8 | 9 | 10 | # ================================================ CONFIG ================================================ 11 | 12 | # Add a flag/switch indicating whether Airflow is in use or not 13 | USING_AIRFLOW = False 14 | 15 | 16 | 17 | # Create a config file for storing environment variables 18 | config = configparser.ConfigParser() 19 | if USING_AIRFLOW: 20 | 21 | # Use the airflow config file from the airflow container 22 | config.read('/usr/local/airflow/dags/etl_to_postgres/airflow_config.ini') 23 | DATASETS_LOCATION_PATH = config['postgres_airflow_config']['DATASET_SOURCE_PATH'] 24 | 25 | host = config['postgres_airflow_config']['HOST'] 26 | port = config['postgres_airflow_config']['PORT'] 27 | database = config['postgres_airflow_config']['STAGING_DB'] 28 | username = config['postgres_airflow_config']['USERNAME'] 29 | password = config['postgres_airflow_config']['PASSWORD'] 30 | 31 | postgres_connection = None 32 | cursor = None 33 | 34 | 35 | else: 36 | 37 | # Use the local config file from the local machine 38 | path = os.path.abspath('dwh_pipelines/local_config.ini') 39 | config.read(path) 40 | DATASETS_LOCATION_PATH = config['travel_data_filepath']['DATASETS_LOCATION_PATH'] 41 | 42 | host = config['travel_data_filepath']['HOST'] 43 | port = config['travel_data_filepath']['PORT'] 44 | database = config['travel_data_filepath']['STAGING_DB'] 45 | username = config['travel_data_filepath']['USERNAME'] 46 | password = config['travel_data_filepath']['PASSWORD'] 47 | 48 | postgres_connection = None 49 | cursor = None 50 | 51 | 52 | 53 | # Connect to the Postgres database 54 | try: 55 | pgsql_connection = psycopg2.connect( 56 | host = host, 57 | port = port, 58 | dbname = database, 59 | user = username, 60 | password = password, 61 | ) 62 | 63 | 64 | # Create a cursor object to execute the PG-SQL commands 65 | cursor = pgsql_connection.cursor() 66 | 67 | 68 | except psycopg2.Error: 69 | raise ConnectionError("CONNECTION ERROR: Unable to connect to the demo_company database...") 70 | 71 | 72 | 73 | # Define the database, schema and table names 74 | 75 | table_name = 'stg_flight_promotion_deals_tbl' 76 | schema_name = 'dev' 77 | database_name = database 78 | 79 | 80 | 81 | # ====================================== TEST 1: DATABASE CONNECTION CHECK ====================================== 82 | 83 | 84 | """ Test 
the connection to the Postgres database is successful or not """ 85 | 86 | def test_database_connection(): 87 | 88 | # Assert the existence of a valid connection to the database (i.e. not None) 89 | assert pgsql_connection is not None, f"CONNECTION ERROR: Unable to connect to the {database_name} database... " 90 | 91 | 92 | 93 | 94 | 95 | 96 | # ====================================== TEST 2: SCHEMA EXISTENCE CHECK ====================================== 97 | 98 | 99 | """ Verify the staging schema exists in the Postgres staging database """ 100 | 101 | 102 | 103 | def test_schema_existence(): 104 | sql_query = f""" SELECT schema_name FROM information_schema.schemata 105 | """ 106 | cursor.execute(sql_query) 107 | 108 | sql_results = cursor.fetchall() 109 | schemas = [schema[0] for schema in sql_results] 110 | 111 | assert schema_name in schemas, f"The '{schema_name}' schema should be found in the '{database_name}' database. " 112 | 113 | 114 | 115 | 116 | 117 | 118 | # ====================================== TEST 3: COLUMNS EXISTENCE CHECK ====================================== 119 | 120 | """ Verify the columns of this table exists in the Postgres staging database """ 121 | 122 | 123 | 124 | def test_columns_existence(): 125 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name}' 126 | """ 127 | cursor.execute(sql_query) 128 | 129 | sql_results = cursor.fetchall() 130 | actual_columns = [column[0] for column in sql_results] 131 | 132 | expected_columns = ['promotion_id', 133 | 'promotion_name', 134 | 'flight_booking_id', 135 | 'applied_discount', 136 | 'created_at', 137 | 'updated_at', 138 | 'source_system', 139 | 'source_file', 140 | 'load_timestamp', 141 | 'dwh_layer' 142 | ] 143 | 144 | for expected_column in expected_columns: 145 | assert expected_column in actual_columns, f"The '{expected_column}' column should be in the '{table_name}' table. " 146 | 147 | 148 | 149 | 150 | 151 | # ====================================== TEST 4: TABLE EXISTENCE CHECK ====================================== 152 | 153 | 154 | """ Check if the active table is in the Postgres staging database """ 155 | 156 | 157 | def test_table_existence(): 158 | sql_query = f""" SELECT * FROM information_schema.tables WHERE table_name = '{table_name}' AND table_schema = '{schema_name}' ; """ 159 | cursor.execute(sql_query) 160 | sql_result = cursor.fetchone() 161 | 162 | assert sql_result is not None, f"The '{table_name}' does not exist in the '{database}.{schema_name}' schema. 
" 163 | 164 | 165 | 166 | 167 | 168 | # ====================================== TEST 5: DATA TYPES CHECK ====================================== 169 | 170 | 171 | """ Test if each column is mapped to the expected data type in Postgres """ 172 | 173 | 174 | def test_column_data_types(): 175 | 176 | # Create a dictionary that specifies the expected data types for each column 177 | expected_data_types = { 178 | 'promotion_id' : "uuid", 179 | 'promotion_name' : "character varying", 180 | 'flight_booking_id' : "uuid", 181 | 'applied_discount' : "numeric", 182 | "created_at" : "timestamp with time zone", 183 | "updated_at" : "timestamp with time zone", 184 | "source_system" : "character varying", 185 | "source_file" : "character varying", 186 | "load_timestamp" : "timestamp without time zone", 187 | "dwh_layer" : "character varying" 188 | 189 | } 190 | 191 | 192 | 193 | # Use SQL to extract the column names and their data types 194 | sql_query = f""" SELECT column_name, data_type from information_schema.columns WHERE table_name = '{table_name}' 195 | """ 196 | cursor.execute(sql_query) 197 | 198 | sql_results = cursor.fetchall() 199 | 200 | for column_name, actual_data_type in sql_results: 201 | assert actual_data_type.lower() == expected_data_types[column_name], f"The expected data type for column '{column_name}' was '{expected_data_types[column_name]}', but the actual data type was '{actual_data_type}'. " 202 | 203 | 204 | 205 | 206 | 207 | # ====================================== TEST 6: EMPTY VALUES CHECK ====================================== 208 | 209 | 210 | """ Check if there are any empty values present in your table """ 211 | 212 | def test_empty_values_in_table(): 213 | sql_query = f""" SELECT * FROM {schema_name}.{table_name} 214 | """ 215 | cursor.execute(sql_query) 216 | sql_results = cursor.fetchall() 217 | 218 | row_no = 0 219 | for record in sql_results: 220 | row_no +=1 221 | for cell_value in record: 222 | assert cell_value is not None, f" There is an empty value in the '{schema_name}.{table_name}' table on row '{row_no}' . " 223 | 224 | 225 | 226 | 227 | 228 | 229 | # ====================================== TEST 7: NULL VALUES CHECK ====================================== 230 | 231 | """ Check if there are any NULL values present in your table """ 232 | 233 | def test_null_values_in_table(): 234 | 235 | # Get list of columns from table 236 | cursor.execute(f""" SELECT column_name from information_schema.columns WHERE table_name = '{table_name}' ; 237 | """) 238 | columns = cursor.fetchall() 239 | 240 | 241 | for column in columns: 242 | sql_query = f'SELECT COUNT(*) FROM {schema_name}.{table_name} WHERE {column[0]} is NULL' 243 | cursor.execute(sql_query) 244 | sql_result = cursor.fetchone() 245 | 246 | assert sql_result[0] == 0, f"The {column} column has NULL values. 
" 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | # ====================================== TEST 8: ID CHARACTER LENGTH CONSTRAINT CHECK ====================================== 255 | 256 | """ Test all the ID columns in the table contain 36 characters in length """ 257 | 258 | def test_id_char_length_constraint(): 259 | expected_id_char_length = 36 260 | sql_results = cursor.fetchall() 261 | 262 | 263 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name} AND column_name LIKE "%_id%" ' 264 | """ 265 | cursor.execute(sql_query) 266 | 267 | sql_results = cursor.fetchall() 268 | 269 | 270 | # Assert the number of characters for the id column is equal to 36 271 | for sql_result in sql_results: 272 | id_column = sql_result[0] 273 | actual_id_length = len(id_column) 274 | assert actual_id_length == expected_id_char_length, f"Invalid ID column found: All ID columns must be {expected_id_char_length} characters long. The ID column containing invalid IDs is '{id_column}' column" 275 | 276 | 277 | 278 | 279 | # ====================================== TEST 9: DUPLICATES CHECK ====================================== 280 | 281 | 282 | """ Test the number of duplicate records appearing in the Postgres table """ 283 | 284 | def test_duplicate_records_count(): 285 | column_name = "promotion_id" 286 | sql_query = f""" SELECT {column_name}, 287 | COUNT (*) 288 | FROM {schema_name}.{table_name} 289 | GROUP BY {column_name} 290 | HAVING COUNT(*) > 1 291 | ; 292 | """ 293 | cursor.execute(sql_query) 294 | 295 | duplicates = cursor.fetchall() 296 | total_no_of_duplicates = len(duplicates) 297 | 298 | # Assert the number of uniqueness constraints for the table specified is at least 1 299 | assert total_no_of_duplicates == 0, f"Duplicate entries detected - {table_name} should contain no duplicate entries." 
300 | 301 | 302 | 303 | 304 | def run_tests(): 305 | test_filepath = os.path.abspath('dwh_pipelines/L2_staging_layer/tests/test_stg_flight_promotion_deals_tbl.py') 306 | test_result = pytest.main([test_filepath]) 307 | return test_result 308 | 309 | 310 | 311 | if __name__ == "__main__": 312 | # Run DQ tests 313 | test_result = run_tests() 314 | 315 | # Create DQ reports in HTML format 316 | from pathlib import Path 317 | import webbrowser 318 | file_path = os.path.abspath(__file__) 319 | current_filepath = Path(__file__).stem 320 | html_report_path = f"{current_filepath}.html" 321 | pytest.main(["-v", "-s", "--capture=tee-sys", file_path, f"--html={html_report_path}", "--self-contained-html"]) 322 | 323 | # Open DQ reports in browser 324 | # dq_report_url = Path.cwd() / html_report_path 325 | # webbrowser.open(dq_report_url.as_uri()) 326 | sys.exit() 327 | -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/test_stg_flight_schedules_tbl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import pytest 5 | import psycopg2 6 | import configparser 7 | from datetime import datetime 8 | 9 | 10 | 11 | 12 | # ================================================ CONFIG ================================================ 13 | 14 | # Add a flag/switch indicating whether Airflow is in use or not 15 | USING_AIRFLOW = False 16 | 17 | 18 | 19 | # Create a config file for storing environment variables 20 | config = configparser.ConfigParser() 21 | if USING_AIRFLOW: 22 | 23 | # Use the airflow config file from the airflow container 24 | config.read('/usr/local/airflow/dags/etl_to_postgres/airflow_config.ini') 25 | DATASETS_LOCATION_PATH = config['postgres_airflow_config']['DATASET_SOURCE_PATH'] 26 | 27 | host = config['postgres_airflow_config']['HOST'] 28 | port = config['postgres_airflow_config']['PORT'] 29 | database = config['postgres_airflow_config']['STAGING_DB'] 30 | username = config['postgres_airflow_config']['USERNAME'] 31 | password = config['postgres_airflow_config']['PASSWORD'] 32 | 33 | postgres_connection = None 34 | cursor = None 35 | 36 | 37 | else: 38 | 39 | # Use the local config file from the local machine 40 | path = os.path.abspath('dwh_pipelines/local_config.ini') 41 | config.read(path) 42 | DATASETS_LOCATION_PATH = config['travel_data_filepath']['DATASETS_LOCATION_PATH'] 43 | 44 | host = config['travel_data_filepath']['HOST'] 45 | port = config['travel_data_filepath']['PORT'] 46 | database = config['travel_data_filepath']['STAGING_DB'] 47 | username = config['travel_data_filepath']['USERNAME'] 48 | password = config['travel_data_filepath']['PASSWORD'] 49 | 50 | postgres_connection = None 51 | cursor = None 52 | 53 | 54 | 55 | # Connect to the Postgres database 56 | try: 57 | pgsql_connection = psycopg2.connect( 58 | host = host, 59 | port = port, 60 | dbname = database, 61 | user = username, 62 | password = password, 63 | ) 64 | 65 | 66 | # Create a cursor object to execute the PG-SQL commands 67 | cursor = pgsql_connection.cursor() 68 | 69 | 70 | except psycopg2.Error: 71 | raise ConnectionError("CONNECTION ERROR: Unable to connect to the demo_company database...") 72 | 73 | 74 | 75 | # Define the database, schema and table names 76 | 77 | table_name = 'stg_flight_schedules_tbl' 78 | schema_name = 'dev' 79 | database_name = database 80 | 81 | 82 | 83 | # ====================================== TEST 1: DATABASE CONNECTION CHECK 
====================================== 84 | 85 | 86 | """ Test the connection to the Postgres database is successful or not """ 87 | 88 | def test_database_connection(): 89 | 90 | # Assert the existence of a valid connection to the database (i.e. not None) 91 | assert pgsql_connection is not None, f"CONNECTION ERROR: Unable to connect to the {database_name} database... " 92 | 93 | 94 | 95 | 96 | 97 | 98 | # ====================================== TEST 2: SCHEMA EXISTENCE CHECK ====================================== 99 | 100 | 101 | """ Verify the staging schema exists in the Postgres staging database """ 102 | 103 | 104 | 105 | def test_schema_existence(): 106 | sql_query = f""" SELECT schema_name FROM information_schema.schemata 107 | """ 108 | cursor.execute(sql_query) 109 | 110 | sql_results = cursor.fetchall() 111 | schemas = [schema[0] for schema in sql_results] 112 | 113 | assert schema_name in schemas, f"The '{schema_name}' schema should be found in the '{database_name}' database. " 114 | 115 | 116 | 117 | 118 | 119 | 120 | # ====================================== TEST 3: COLUMNS EXISTENCE CHECK ====================================== 121 | 122 | """ Verify the columns of this table exists in the Postgres staging database """ 123 | 124 | 125 | 126 | def test_columns_existence(): 127 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name}' 128 | """ 129 | cursor.execute(sql_query) 130 | 131 | sql_results = cursor.fetchall() 132 | actual_columns = [column[0] for column in sql_results] 133 | 134 | expected_columns = ['flight_id', 135 | 'arrival_city', 136 | 'arrival_date', 137 | 'arrival_time', 138 | 'departure_city', 139 | 'departure_time', 140 | 'duration', 141 | 'flight_date', 142 | ] 143 | 144 | for expected_column in expected_columns: 145 | assert expected_column in actual_columns, f"The '{expected_column}' column should be in the '{table_name}' table. " 146 | 147 | 148 | 149 | 150 | 151 | # ====================================== TEST 4: TABLE EXISTENCE CHECK ====================================== 152 | 153 | 154 | """ Check if the active table is in the Postgres staging database """ 155 | 156 | 157 | def test_table_existence(): 158 | sql_query = f""" SELECT * FROM information_schema.tables WHERE table_name = '{table_name}' AND table_schema = '{schema_name}' ; """ 159 | cursor.execute(sql_query) 160 | sql_result = cursor.fetchone() 161 | 162 | assert sql_result is not None, f"The '{table_name}' does not exist in the '{database}.{schema_name}' schema. 
" 163 | 164 | 165 | 166 | 167 | 168 | # ====================================== TEST 5: DATA TYPES CHECK ====================================== 169 | 170 | 171 | """ Test if each column is mapped to the expected data type in Postgres """ 172 | 173 | 174 | def test_column_data_types(): 175 | 176 | # Create a dictionary that specifies the expected data types for each column 177 | expected_data_types = { 178 | 'flight_id' : "uuid", 179 | 'arrival_city' : "character varying", 180 | 'arrival_date' : "date", 181 | 'arrival_time' : "time without time zone", 182 | 'departure_city' : "character varying", 183 | 'departure_time' : "time without time zone", 184 | 'duration' : "numeric", 185 | 'flight_date' : "date", 186 | "created_at" : "timestamp with time zone", 187 | "updated_at" : "timestamp with time zone", 188 | "source_system" : "character varying", 189 | "source_file" : "character varying", 190 | "load_timestamp" : "timestamp without time zone", 191 | "dwh_layer" : "character varying" 192 | 193 | } 194 | 195 | 196 | 197 | # Use SQL to extract the column names and their data types 198 | sql_query = f""" SELECT column_name, data_type from information_schema.columns WHERE table_name = '{table_name}' 199 | """ 200 | cursor.execute(sql_query) 201 | 202 | sql_results = cursor.fetchall() 203 | 204 | for column_name, actual_data_type in sql_results: 205 | assert actual_data_type.lower() == expected_data_types[column_name], f"The expected data type for column '{column_name}' was '{expected_data_types[column_name]}', but the actual data type was '{actual_data_type}'. " 206 | 207 | 208 | 209 | 210 | 211 | # ====================================== TEST 6: EMPTY VALUES CHECK ====================================== 212 | 213 | 214 | """ Check if there are any empty values present in your table """ 215 | 216 | def test_empty_values_in_table(): 217 | sql_query = f""" SELECT * FROM {schema_name}.{table_name} 218 | """ 219 | cursor.execute(sql_query) 220 | sql_results = cursor.fetchall() 221 | 222 | row_no = 0 223 | for record in sql_results: 224 | row_no +=1 225 | for cell_value in record: 226 | assert cell_value is not None, f" There is an empty value in the '{schema_name}.{table_name}' table on row '{row_no}' . " 227 | 228 | 229 | 230 | 231 | 232 | 233 | # ====================================== TEST 7: NULL VALUES CHECK ====================================== 234 | 235 | """ Check if there are any NULL values present in your table """ 236 | 237 | def test_null_values_in_table(): 238 | 239 | # Get list of columns from table 240 | cursor.execute(f""" SELECT column_name from information_schema.columns WHERE table_name = '{table_name}' ; 241 | """) 242 | columns = cursor.fetchall() 243 | 244 | 245 | for column in columns: 246 | sql_query = f'SELECT COUNT(*) FROM {schema_name}.{table_name} WHERE {column[0]} is NULL' 247 | cursor.execute(sql_query) 248 | sql_result = cursor.fetchone() 249 | 250 | assert sql_result[0] == 0, f"The {column} column has NULL values. 
" 251 | 252 | 253 | 254 | 255 | 256 | 257 | 258 | # ====================================== TEST 8: DATE FORMATTING CHECK ====================================== 259 | 260 | 261 | """ Check the date columns contain values in the 'yyyy-mm-dd' format """ 262 | 263 | def test_date_formatting_constraint(): 264 | expected_date_format = r"^\d{4}-\d{2}-\d{2}$" 265 | data_type = 'date' 266 | 267 | sql_query_1 = f''' SELECT column_name FROM information_schema.columns WHERE table_name = '{table_name}' AND data_type = '{data_type}' ''' 268 | cursor.execute(sql_query_1) 269 | 270 | sql_results_1 = cursor.fetchall() 271 | date_columns = [sql_result[0] for sql_result in sql_results_1] 272 | 273 | for date_column in date_columns: 274 | sql_query_2 = f""" SELECT {date_column} 275 | FROM {schema_name}.{table_name} 276 | """ 277 | cursor.execute(sql_query_2) 278 | sql_results_2 = cursor.fetchall() 279 | for sql_result in sql_results_2: 280 | date_value = sql_result[0].strftime("%Y-%m-%d") 281 | assert re.match(expected_date_format, date_value) is not None, f"Invalid date detected - date values should be in 'yyyy-mm-dd' format." 282 | 283 | 284 | 285 | 286 | # ====================================== TEST 9: ID CHARACTER LENGTH CONSTRAINT CHECK ====================================== 287 | 288 | """ Test all the ID columns in the table contain 36 characters in length """ 289 | 290 | def test_id_char_length_constraint(): 291 | expected_id_char_length = 36 292 | sql_results = cursor.fetchall() 293 | 294 | 295 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name} AND column_name LIKE "%_id%" ' 296 | """ 297 | cursor.execute(sql_query) 298 | 299 | sql_results = cursor.fetchall() 300 | 301 | 302 | # Assert the number of characters for the id column is equal to 36 303 | for sql_result in sql_results: 304 | id_column = sql_result[0] 305 | actual_id_length = len(id_column) 306 | assert actual_id_length == expected_id_char_length, f"Invalid ID column found: All ID columns must be {expected_id_char_length} characters long. The ID column containing invalid IDs is '{id_column}' column" 307 | 308 | 309 | 310 | 311 | # ====================================== TEST 10: DUPLICATES CHECK ====================================== 312 | 313 | 314 | """ Test the number of duplicate records appearing in the Postgres table """ 315 | 316 | def test_duplicate_records_count(): 317 | column_name = "flight_id" 318 | sql_query = f""" SELECT {column_name}, 319 | COUNT (*) 320 | FROM {schema_name}.{table_name} 321 | GROUP BY {column_name} 322 | HAVING COUNT(*) > 1 323 | ; 324 | """ 325 | cursor.execute(sql_query) 326 | 327 | duplicates = cursor.fetchall() 328 | total_no_of_duplicates = len(duplicates) 329 | 330 | # Assert the number of uniqueness constraints for the table specified is at least 1 331 | assert total_no_of_duplicates == 0, f"Duplicate entries detected - {table_name} should contain no duplicate entries." 
332 | 333 | 334 | 335 | 336 | def run_tests(): 337 | test_filepath = os.path.abspath('dwh_pipelines/L2_staging_layer/tests/test_stg_flight_schedules_tbl.py') 338 | test_result = pytest.main([test_filepath]) 339 | return test_result 340 | 341 | 342 | 343 | if __name__ == "__main__": 344 | # Run DQ tests 345 | test_result = run_tests() 346 | 347 | # Create DQ reports in HTML format 348 | from pathlib import Path 349 | import webbrowser 350 | file_path = os.path.abspath(__file__) 351 | current_filepath = Path(__file__).stem 352 | html_report_path = f"{current_filepath}.html" 353 | pytest.main(["-v", "-s", "--capture=tee-sys", file_path, f"--html={html_report_path}", "--self-contained-html"]) 354 | 355 | # Open DQ reports in browser 356 | # dq_report_url = Path.cwd() / html_report_path 357 | # webbrowser.open(dq_report_url.as_uri()) 358 | sys.exit() 359 | -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/test_stg_flight_ticket_sales_tbl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import pytest 5 | import psycopg2 6 | import configparser 7 | from datetime import datetime 8 | 9 | 10 | 11 | 12 | # ================================================ CONFIG ================================================ 13 | 14 | # Add a flag/switch indicating whether Airflow is in use or not 15 | USING_AIRFLOW = False 16 | 17 | 18 | 19 | # Create a config file for storing environment variables 20 | config = configparser.ConfigParser() 21 | if USING_AIRFLOW: 22 | 23 | # Use the airflow config file from the airflow container 24 | config.read('/usr/local/airflow/dags/etl_to_postgres/airflow_config.ini') 25 | DATASETS_LOCATION_PATH = config['postgres_airflow_config']['DATASET_SOURCE_PATH'] 26 | 27 | host = config['postgres_airflow_config']['HOST'] 28 | port = config['postgres_airflow_config']['PORT'] 29 | database = config['postgres_airflow_config']['STAGING_DB'] 30 | username = config['postgres_airflow_config']['USERNAME'] 31 | password = config['postgres_airflow_config']['PASSWORD'] 32 | 33 | postgres_connection = None 34 | cursor = None 35 | 36 | 37 | else: 38 | 39 | # Use the local config file from the local machine 40 | path = os.path.abspath('dwh_pipelines/local_config.ini') 41 | config.read(path) 42 | DATASETS_LOCATION_PATH = config['travel_data_filepath']['DATASETS_LOCATION_PATH'] 43 | 44 | host = config['travel_data_filepath']['HOST'] 45 | port = config['travel_data_filepath']['PORT'] 46 | database = config['travel_data_filepath']['STAGING_DB'] 47 | username = config['travel_data_filepath']['USERNAME'] 48 | password = config['travel_data_filepath']['PASSWORD'] 49 | 50 | postgres_connection = None 51 | cursor = None 52 | 53 | 54 | 55 | # Connect to the Postgres database 56 | try: 57 | pgsql_connection = psycopg2.connect( 58 | host = host, 59 | port = port, 60 | dbname = database, 61 | user = username, 62 | password = password, 63 | ) 64 | 65 | 66 | # Create a cursor object to execute the PG-SQL commands 67 | cursor = pgsql_connection.cursor() 68 | 69 | 70 | except psycopg2.Error: 71 | raise ConnectionError("CONNECTION ERROR: Unable to connect to the demo_company database...") 72 | 73 | 74 | 75 | # Define the database, schema and table names 76 | 77 | table_name = 'stg_flight_ticket_sales_tbl' 78 | schema_name = 'dev' 79 | database_name = database 80 | 81 | 82 | 83 | # ====================================== TEST 1: DATABASE CONNECTION CHECK 
====================================== 84 | 85 | 86 | """ Test the connection to the Postgres database is successful or not """ 87 | 88 | def test_database_connection(): 89 | 90 | # Assert the existence of a valid connection to the database (i.e. not None) 91 | assert pgsql_connection is not None, f"CONNECTION ERROR: Unable to connect to the {database_name} database... " 92 | 93 | 94 | 95 | 96 | 97 | 98 | # ====================================== TEST 2: SCHEMA EXISTENCE CHECK ====================================== 99 | 100 | 101 | """ Verify the staging schema exists in the Postgres staging database """ 102 | 103 | 104 | 105 | def test_schema_existence(): 106 | sql_query = f""" SELECT schema_name FROM information_schema.schemata 107 | """ 108 | cursor.execute(sql_query) 109 | 110 | sql_results = cursor.fetchall() 111 | schemas = [schema[0] for schema in sql_results] 112 | 113 | assert schema_name in schemas, f"The '{schema_name}' schema should be found in the '{database_name}' database. " 114 | 115 | 116 | 117 | 118 | 119 | 120 | # ====================================== TEST 3: COLUMNS EXISTENCE CHECK ====================================== 121 | 122 | """ Verify the columns of this table exists in the Postgres staging database """ 123 | 124 | 125 | 126 | def test_columns_existence(): 127 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name}' 128 | """ 129 | cursor.execute(sql_query) 130 | 131 | sql_results = cursor.fetchall() 132 | actual_columns = [column[0] for column in sql_results] 133 | 134 | expected_columns = ['agent_first_name', 135 | 'agent_id', 136 | 'agent_last_name', 137 | 'customer_first_name', 138 | 'customer_id', 139 | 'customer_last_name', 140 | 'discount', 141 | 'flight_booking_id', 142 | 'promotion_id', 143 | 'promotion_name', 144 | 'ticket_sales', 145 | 'ticket_sales_date', 146 | 'created_at', 147 | 'updated_at', 148 | 'source_system', 149 | 'source_file', 150 | 'load_timestamp', 151 | 'dwh_layer' 152 | ] 153 | 154 | for expected_column in expected_columns: 155 | assert expected_column in actual_columns, f"The '{expected_column}' column should be in the '{table_name}' table. " 156 | 157 | 158 | 159 | 160 | 161 | # ====================================== TEST 4: TABLE EXISTENCE CHECK ====================================== 162 | 163 | 164 | """ Check if the active table is in the Postgres staging database """ 165 | 166 | 167 | def test_table_existence(): 168 | sql_query = f""" SELECT * FROM information_schema.tables WHERE table_name = '{table_name}' AND table_schema = '{schema_name}' ; """ 169 | cursor.execute(sql_query) 170 | sql_result = cursor.fetchone() 171 | 172 | assert sql_result is not None, f"The '{table_name}' does not exist in the '{database}.{schema_name}' schema. 
" 173 | 174 | 175 | 176 | 177 | 178 | # ====================================== TEST 5: DATA TYPES CHECK ====================================== 179 | 180 | 181 | """ Test if each column is mapped to the expected data type in Postgres """ 182 | 183 | 184 | def test_column_data_types(): 185 | 186 | # Create a dictionary that specifies the expected data types for each column 187 | expected_data_types = { 188 | 'agent_first_name' : "character varying", 189 | 'agent_id' : "uuid", 190 | 'agent_last_name' : "character varying", 191 | 'customer_first_name' : "character varying", 192 | 'customer_id' : "uuid", 193 | 'customer_last_name' : "character varying", 194 | 'discount' : "numeric", 195 | 'flight_booking_id' : "uuid", 196 | 'promotion_id' : "uuid", 197 | 'promotion_name' : "character varying", 198 | 'ticket_sales' : "character varying", 199 | 'ticket_sales_date' : "date", 200 | "created_at" : "timestamp with time zone", 201 | "updated_at" : "timestamp with time zone", 202 | "source_system" : "character varying", 203 | "source_file" : "character varying", 204 | "load_timestamp" : "timestamp without time zone", 205 | "dwh_layer" : "character varying" 206 | 207 | } 208 | 209 | 210 | 211 | # Use SQL to extract the column names and their data types 212 | sql_query = f""" SELECT column_name, data_type from information_schema.columns WHERE table_name = '{table_name}' 213 | """ 214 | cursor.execute(sql_query) 215 | 216 | sql_results = cursor.fetchall() 217 | 218 | for column_name, actual_data_type in sql_results: 219 | assert actual_data_type.lower() == expected_data_types[column_name], f"The expected data type for column '{column_name}' was '{expected_data_types[column_name]}', but the actual data type was '{actual_data_type}'. " 220 | 221 | 222 | 223 | 224 | 225 | # ====================================== TEST 6: EMPTY VALUES CHECK ====================================== 226 | 227 | 228 | """ Check if there are any empty values present in your table """ 229 | 230 | def test_empty_values_in_table(): 231 | sql_query = f""" SELECT * FROM {schema_name}.{table_name} 232 | """ 233 | cursor.execute(sql_query) 234 | sql_results = cursor.fetchall() 235 | 236 | row_no = 0 237 | for record in sql_results: 238 | row_no +=1 239 | for cell_value in record: 240 | assert cell_value is not None, f" There is an empty value in the '{schema_name}.{table_name}' table on row '{row_no}' . " 241 | 242 | 243 | 244 | 245 | 246 | 247 | # ====================================== TEST 7: NULL VALUES CHECK ====================================== 248 | 249 | """ Check if there are any NULL values present in your table """ 250 | 251 | def test_null_values_in_table(): 252 | 253 | # Get list of columns from table 254 | cursor.execute(f""" SELECT column_name from information_schema.columns WHERE table_name = '{table_name}' ; 255 | """) 256 | columns = cursor.fetchall() 257 | 258 | 259 | for column in columns: 260 | sql_query = f'SELECT COUNT(*) FROM {schema_name}.{table_name} WHERE {column[0]} is NULL' 261 | cursor.execute(sql_query) 262 | sql_result = cursor.fetchone() 263 | 264 | assert sql_result[0] == 0, f"The {column} column has NULL values. 
" 265 | 266 | 267 | 268 | 269 | 270 | 271 | 272 | # ====================================== TEST 8: DATE FORMATTING CHECK ====================================== 273 | 274 | 275 | """ Check the date columns contain values in the 'yyyy-mm-dd' format """ 276 | 277 | def test_date_formatting_constraint(): 278 | expected_date_format = r"^\d{4}-\d{2}-\d{2}$" 279 | data_type = 'date' 280 | 281 | sql_query_1 = f''' SELECT column_name FROM information_schema.columns WHERE table_name = '{table_name}' AND data_type = '{data_type}' ''' 282 | cursor.execute(sql_query_1) 283 | 284 | sql_results_1 = cursor.fetchall() 285 | date_columns = [sql_result[0] for sql_result in sql_results_1] 286 | 287 | for date_column in date_columns: 288 | sql_query_2 = f""" SELECT {date_column} 289 | FROM {schema_name}.{table_name} 290 | """ 291 | cursor.execute(sql_query_2) 292 | sql_results_2 = cursor.fetchall() 293 | for sql_result in sql_results_2: 294 | date_value = sql_result[0].strftime("%Y-%m-%d") 295 | assert re.match(expected_date_format, date_value) is not None, f"Invalid date detected - date values should be in 'yyyy-mm-dd' format." 296 | 297 | 298 | 299 | 300 | # ====================================== TEST 9: ID CHARACTER LENGTH CONSTRAINT CHECK ====================================== 301 | 302 | """ Test all the ID columns in the table contain 36 characters in length """ 303 | 304 | def test_id_char_length_constraint(): 305 | expected_id_char_length = 36 306 | sql_results = cursor.fetchall() 307 | 308 | 309 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name} AND column_name LIKE "%_id%" ' 310 | """ 311 | cursor.execute(sql_query) 312 | 313 | sql_results = cursor.fetchall() 314 | 315 | 316 | # Assert the number of characters for the id column is equal to 36 317 | for sql_result in sql_results: 318 | id_column = sql_result[0] 319 | actual_id_length = len(id_column) 320 | assert actual_id_length == expected_id_char_length, f"Invalid ID column found: All ID columns must be {expected_id_char_length} characters long. The ID column containing invalid IDs is '{id_column}' column" 321 | 322 | 323 | 324 | 325 | 326 | 327 | # ====================================== TEST 10: DUPLICATES CHECK ====================================== 328 | 329 | 330 | """ Test the number of duplicate records appearing in the Postgres table """ 331 | 332 | def test_duplicate_records_count(): 333 | column_name = "flight_booking_id" 334 | sql_query = f""" SELECT {column_name}, 335 | COUNT (*) 336 | FROM {schema_name}.{table_name} 337 | GROUP BY {column_name} 338 | HAVING COUNT(*) > 1 339 | ; 340 | """ 341 | cursor.execute(sql_query) 342 | 343 | duplicates = cursor.fetchall() 344 | total_no_of_duplicates = len(duplicates) 345 | 346 | # Assert the number of uniqueness constraints for the table specified is at least 1 347 | assert total_no_of_duplicates == 0, f"Duplicate entries detected - {table_name} should contain no duplicate entries." 
348 | 349 | 350 | 351 | def run_tests(): 352 | test_filepath = os.path.abspath('dwh_pipelines/L2_staging_layer/tests/test_stg_flight_ticket_sales_tbl.py') 353 | test_result = pytest.main([test_filepath]) 354 | return test_result 355 | 356 | 357 | 358 | if __name__ == "__main__": 359 | # Run DQ tests 360 | test_result = run_tests() 361 | 362 | # Create DQ reports in HTML format 363 | from pathlib import Path 364 | import webbrowser 365 | file_path = os.path.abspath(__file__) 366 | current_filepath = Path(__file__).stem 367 | html_report_path = f"{current_filepath}.html" 368 | pytest.main(["-v", "-s", "--capture=tee-sys", file_path, f"--html={html_report_path}", "--self-contained-html"]) 369 | 370 | # Open DQ reports in browser 371 | # dq_report_url = Path.cwd() / html_report_path 372 | # webbrowser.open(dq_report_url.as_uri()) 373 | sys.exit() 374 | 375 | -------------------------------------------------------------------------------- /dwh_pipelines/L2_staging_layer/tests/test_stg_ticket_prices_tbl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | import sys 4 | import pytest 5 | import psycopg2 6 | import configparser 7 | from datetime import datetime 8 | 9 | 10 | 11 | 12 | # ================================================ CONFIG ================================================ 13 | 14 | # Add a flag/switch indicating whether Airflow is in use or not 15 | USING_AIRFLOW = False 16 | 17 | 18 | 19 | # Create a config file for storing environment variables 20 | config = configparser.ConfigParser() 21 | if USING_AIRFLOW: 22 | 23 | # Use the airflow config file from the airflow container 24 | config.read('/usr/local/airflow/dags/etl_to_postgres/airflow_config.ini') 25 | DATASETS_LOCATION_PATH = config['postgres_airflow_config']['DATASET_SOURCE_PATH'] 26 | 27 | host = config['postgres_airflow_config']['HOST'] 28 | port = config['postgres_airflow_config']['PORT'] 29 | database = config['postgres_airflow_config']['STAGING_DB'] 30 | username = config['postgres_airflow_config']['USERNAME'] 31 | password = config['postgres_airflow_config']['PASSWORD'] 32 | 33 | postgres_connection = None 34 | cursor = None 35 | 36 | 37 | else: 38 | 39 | # Use the local config file from the local machine 40 | path = os.path.abspath('dwh_pipelines/local_config.ini') 41 | config.read(path) 42 | DATASETS_LOCATION_PATH = config['travel_data_filepath']['DATASETS_LOCATION_PATH'] 43 | 44 | host = config['travel_data_filepath']['HOST'] 45 | port = config['travel_data_filepath']['PORT'] 46 | database = config['travel_data_filepath']['STAGING_DB'] 47 | username = config['travel_data_filepath']['USERNAME'] 48 | password = config['travel_data_filepath']['PASSWORD'] 49 | 50 | postgres_connection = None 51 | cursor = None 52 | 53 | 54 | 55 | # Connect to the Postgres database 56 | try: 57 | pgsql_connection = psycopg2.connect( 58 | host = host, 59 | port = port, 60 | dbname = database, 61 | user = username, 62 | password = password, 63 | ) 64 | 65 | 66 | # Create a cursor object to execute the PG-SQL commands 67 | cursor = pgsql_connection.cursor() 68 | 69 | 70 | except psycopg2.Error: 71 | raise ConnectionError("CONNECTION ERROR: Unable to connect to the demo_company database...") 72 | 73 | 74 | 75 | # Define the database, schema and table names 76 | 77 | table_name = 'stg_ticket_prices_tbl' 78 | schema_name = 'dev' 79 | database_name = database 80 | 81 | 82 | 83 | # ====================================== TEST 1: DATABASE CONNECTION CHECK 
====================================== 84 | 85 | 86 | """ Test the connection to the Postgres database is successful or not """ 87 | 88 | def test_database_connection(): 89 | 90 | # Assert the existence of a valid connection to the database (i.e. not None) 91 | assert pgsql_connection is not None, f"CONNECTION ERROR: Unable to connect to the {database_name} database... " 92 | 93 | 94 | 95 | 96 | 97 | 98 | # ====================================== TEST 2: SCHEMA EXISTENCE CHECK ====================================== 99 | 100 | 101 | """ Verify the staging schema exists in the Postgres staging database """ 102 | 103 | 104 | 105 | def test_schema_existence(): 106 | sql_query = f""" SELECT schema_name FROM information_schema.schemata 107 | """ 108 | cursor.execute(sql_query) 109 | 110 | sql_results = cursor.fetchall() 111 | schemas = [schema[0] for schema in sql_results] 112 | 113 | assert schema_name in schemas, f"The '{schema_name}' schema should be found in the '{database_name}' database. " 114 | 115 | 116 | 117 | 118 | 119 | 120 | # ====================================== TEST 3: COLUMNS EXISTENCE CHECK ====================================== 121 | 122 | """ Verify the columns of this table exists in the Postgres staging database """ 123 | 124 | 125 | 126 | def test_columns_existence(): 127 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name}' 128 | """ 129 | cursor.execute(sql_query) 130 | 131 | sql_results = cursor.fetchall() 132 | actual_columns = [column[0] for column in sql_results] 133 | 134 | expected_columns = ['flight_id', 135 | 'ticket_price', 136 | 'ticket_price_date', 137 | 'created_at', 138 | 'updated_at', 139 | 'source_system', 140 | 'source_file', 141 | 'load_timestamp', 142 | 'dwh_layer' 143 | ] 144 | 145 | for expected_column in expected_columns: 146 | assert expected_column in actual_columns, f"The '{expected_column}' column should be in the '{table_name}' table. " 147 | 148 | 149 | 150 | 151 | 152 | # ====================================== TEST 4: TABLE EXISTENCE CHECK ====================================== 153 | 154 | 155 | """ Check if the active table is in the Postgres staging database """ 156 | 157 | 158 | def test_table_existence(): 159 | sql_query = f""" SELECT * FROM information_schema.tables WHERE table_name = '{table_name}' AND table_schema = '{schema_name}' ; """ 160 | cursor.execute(sql_query) 161 | sql_result = cursor.fetchone() 162 | 163 | assert sql_result is not None, f"The '{table_name}' does not exist in the '{database}.{schema_name}' schema. 
" 164 | 165 | 166 | 167 | 168 | 169 | # ====================================== TEST 5: DATA TYPES CHECK ====================================== 170 | 171 | 172 | """ Test if each column is mapped to the expected data type in Postgres """ 173 | 174 | 175 | def test_column_data_types(): 176 | 177 | # Create a dictionary that specifies the expected data types for each column 178 | expected_data_types = { 179 | 'flight_id' : "uuid", 180 | 'ticket_price' : "numeric", 181 | 'ticket_price_date' : "date", 182 | "created_at" : "timestamp with time zone", 183 | "updated_at" : "timestamp with time zone", 184 | "source_system" : "character varying", 185 | "source_file" : "character varying", 186 | "load_timestamp" : "timestamp without time zone", 187 | "dwh_layer" : "character varying" 188 | 189 | } 190 | 191 | 192 | 193 | # Use SQL to extract the column names and their data types 194 | sql_query = f""" SELECT column_name, data_type from information_schema.columns WHERE table_name = '{table_name}' 195 | """ 196 | cursor.execute(sql_query) 197 | 198 | sql_results = cursor.fetchall() 199 | 200 | for column_name, actual_data_type in sql_results: 201 | assert actual_data_type.lower() == expected_data_types[column_name], f"The expected data type for column '{column_name}' was '{expected_data_types[column_name]}', but the actual data type was '{actual_data_type}'. " 202 | 203 | 204 | 205 | 206 | 207 | # ====================================== TEST 6: EMPTY VALUES CHECK ====================================== 208 | 209 | 210 | """ Check if there are any empty values present in your table """ 211 | 212 | def test_empty_values_in_table(): 213 | sql_query = f""" SELECT * FROM {schema_name}.{table_name} 214 | """ 215 | cursor.execute(sql_query) 216 | sql_results = cursor.fetchall() 217 | 218 | row_no = 0 219 | for record in sql_results: 220 | row_no +=1 221 | for cell_value in record: 222 | assert cell_value is not None, f" There is an empty value in the '{schema_name}.{table_name}' table on row '{row_no}' . " 223 | 224 | 225 | 226 | 227 | 228 | 229 | # ====================================== TEST 7: NULL VALUES CHECK ====================================== 230 | 231 | """ Check if there are any NULL values present in your table """ 232 | 233 | def test_null_values_in_table(): 234 | 235 | # Get list of columns from table 236 | cursor.execute(f""" SELECT column_name from information_schema.columns WHERE table_name = '{table_name}' ; 237 | """) 238 | columns = cursor.fetchall() 239 | 240 | 241 | for column in columns: 242 | sql_query = f'SELECT COUNT(*) FROM {schema_name}.{table_name} WHERE {column[0]} is NULL' 243 | cursor.execute(sql_query) 244 | sql_result = cursor.fetchone() 245 | 246 | assert sql_result[0] == 0, f"The {column} column has NULL values. 
" 247 | 248 | 249 | 250 | 251 | 252 | 253 | 254 | # ====================================== TEST 8: DATE FORMATTING CHECK ====================================== 255 | 256 | 257 | """ Check the date columns contain values in the 'yyyy-mm-dd' format """ 258 | 259 | def test_date_formatting_constraint(): 260 | expected_date_format = r"^\d{4}-\d{2}-\d{2}$" 261 | data_type = 'date' 262 | 263 | sql_query_1 = f''' SELECT column_name FROM information_schema.columns WHERE table_name = '{table_name}' AND data_type = '{data_type}' ''' 264 | cursor.execute(sql_query_1) 265 | 266 | sql_results_1 = cursor.fetchall() 267 | date_columns = [sql_result[0] for sql_result in sql_results_1] 268 | 269 | for date_column in date_columns: 270 | sql_query_2 = f""" SELECT {date_column} 271 | FROM {schema_name}.{table_name} 272 | """ 273 | cursor.execute(sql_query_2) 274 | sql_results_2 = cursor.fetchall() 275 | for sql_result in sql_results_2: 276 | date_value = sql_result[0].strftime("%Y-%m-%d") 277 | assert re.match(expected_date_format, date_value) is not None, f"Invalid date detected - date values should be in 'yyyy-mm-dd' format." 278 | 279 | 280 | 281 | 282 | # ====================================== TEST 9: ID CHARACTER LENGTH CONSTRAINT CHECK ====================================== 283 | 284 | """ Test all the ID columns in the table contain 36 characters in length """ 285 | 286 | def test_id_char_length_constraint(): 287 | expected_id_char_length = 36 288 | sql_results = cursor.fetchall() 289 | 290 | 291 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name} AND column_name LIKE "%_id%" ' 292 | """ 293 | cursor.execute(sql_query) 294 | 295 | sql_results = cursor.fetchall() 296 | 297 | 298 | # Assert the number of characters for the id column is equal to 36 299 | for sql_result in sql_results: 300 | id_column = sql_result[0] 301 | actual_id_length = len(id_column) 302 | assert actual_id_length == expected_id_char_length, f"Invalid ID column found: All ID columns must be {expected_id_char_length} characters long. 
The ID column containing invalid IDs is '{id_column}' column" 303 | 304 | 305 | 306 | 307 | # ====================================== TEST 10: TICKET PRICE POSITIVE VALUES CHECK ====================================== 308 | 309 | """ Check if the ticket_price column only contains positive values """ 310 | 311 | def test_positive_ticket_price_col(): 312 | sql_column = "ticket_price" 313 | 314 | sql_query = f""" SELECT {sql_column} 315 | FROM {schema_name}.{table_name} 316 | ; 317 | """ 318 | cursor.execute(sql_query) 319 | 320 | sql_results = cursor.fetchall() 321 | 322 | # Assert the values in the ticket_price column are all positive values 323 | for sql_result in sql_results: 324 | ticket_price = sql_result[0] 325 | assert ticket_price > 0, f"Invalid total price detected - total price must be a positive value " 326 | 327 | 328 | 329 | # ====================================== TEST 11: DUPLICATES CHECK ====================================== 330 | 331 | 332 | """ Test the number of duplicate records appearing in the Postgres table """ 333 | 334 | def test_duplicate_records_count(): 335 | column_name = "flight_id" 336 | sql_query = f""" SELECT {column_name}, 337 | COUNT (*) 338 | FROM {schema_name}.{table_name} 339 | GROUP BY {column_name} 340 | HAVING COUNT(*) > 1 341 | ; 342 | """ 343 | cursor.execute(sql_query) 344 | 345 | duplicates = cursor.fetchall() 346 | total_no_of_duplicates = len(duplicates) 347 | 348 | # Assert the number of uniqueness constraints for the table specified is at least 1 349 | assert total_no_of_duplicates == 0, f"Duplicate entries detected - {table_name} should contain no duplicate entries." 350 | 351 | 352 | 353 | 354 | def run_tests(): 355 | test_filepath = os.path.abspath('dwh_pipelines/L2_staging_layer/tests/test_stg_ticket_prices_tbl.py') 356 | test_result = pytest.main([test_filepath]) 357 | return test_result 358 | 359 | 360 | 361 | if __name__ == "__main__": 362 | # Run DQ tests 363 | test_result = run_tests() 364 | 365 | # Create DQ reports in HTML format 366 | from pathlib import Path 367 | import webbrowser 368 | file_path = os.path.abspath(__file__) 369 | current_filepath = Path(__file__).stem 370 | html_report_path = f"{current_filepath}.html" 371 | pytest.main(["-v", "-s", "--capture=tee-sys", file_path, f"--html={html_report_path}", "--self-contained-html"]) 372 | 373 | # Open DQ reports in browser 374 | # dq_report_url = Path.cwd() / html_report_path 375 | # webbrowser.open(dq_report_url.as_uri()) 376 | sys.exit() 377 | -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/L3_email_bot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import smtplib 3 | from datetime import datetime 4 | from email.mime.multipart import MIMEMultipart 5 | from email.mime.base import MIMEBase 6 | from email.mime.text import MIMEText 7 | from email.utils import COMMASPACE 8 | from email import encoders 9 | from pathlib import Path 10 | from dotenv import load_dotenv 11 | 12 | 13 | # Load environment variables from .env 14 | load_dotenv() 15 | 16 | 17 | 18 | # Set up constants 19 | current_filepath = Path(__file__).stem 20 | 21 | SMTP_PORT = 587 22 | SMTP_HOST_SERVER = "smtp.gmail.com" 23 | CURRENT_TIMESTAMP = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 24 | EMAIL_ADDRESS = os.getenv("SENDER") 25 | EMAIL_PASSWORD = os.getenv("EMAIL_PASSWORD") 26 | SENDER = "Postgres Data Warehouse Program - SDW" 27 | RECIPIENT = os.getenv("RECIPIENT") 28 | 29 | 30 | 
L0_LOG_DIRECTORY = os.getenv("L0_LOG_DIRECTORY") 31 | L1_LOG_DIRECTORY = os.getenv("L1_LOG_DIRECTORY") 32 | L2_LOG_DIRECTORY = os.getenv("L2_LOG_DIRECTORY") 33 | L3_LOG_DIRECTORY = os.getenv("L3_LOG_DIRECTORY") 34 | L4_LOG_DIRECTORY = os.getenv("L4_LOG_DIRECTORY") 35 | 36 | body_main_subject = "loading data from staging tables into the MDM tables of the Postgres data warehouse" 37 | body = f"""Hi Stephen, 38 | 39 | See attached the logs for {body_main_subject}. 40 | 41 | Regards, 42 | {SENDER} 43 | 44 | """ 45 | 46 | 47 | # Create function for getting the directory paths for log files 48 | def get_log_filepaths(log_directory): 49 | log_filepaths = [] 50 | for root, directories, log_files in os.walk(log_directory): 51 | for filename in log_files: 52 | log_filepath = os.path.join(root, filename) 53 | log_filepaths.append(log_filepath) 54 | return log_filepaths 55 | 56 | 57 | # Create function for attaching log files to email 58 | def attach_log_files_to_email(message, log_filepaths): 59 | for log_file in log_filepaths: 60 | with open(log_file, 'rb') as file: 61 | log_attachment = MIMEBase('application', 'octet-stream') 62 | log_attachment.set_payload(file.read()) 63 | encoders.encode_base64(log_attachment) 64 | log_attachment.add_header('Content-Disposition', f'attachment; filename="{os.path.basename(log_file)}"') 65 | message.attach(log_attachment) 66 | 67 | 68 | 69 | 70 | # ===================================== SETTING UP LOG FILE ATTACHMENTS ===================================== 71 | 72 | 73 | # Get directory paths for log files 74 | semantic_layer_log_directory = get_log_filepaths(L3_LOG_DIRECTORY) 75 | 76 | log_file_counter = 0 77 | for log_file in semantic_layer_log_directory: 78 | log_file_counter += 1 79 | print('') 80 | print(f'Log file {log_file_counter}: {log_file} ') 81 | 82 | 83 | 84 | # ===================================== SETTING UP EMAIL MESSAGE ===================================== 85 | 86 | # Set up constants for email 87 | message = MIMEMultipart() 88 | message["From"] = SENDER 89 | message["To"] = RECIPIENT 90 | message["Subject"] = f"L3 - Semantic Layer Log Files - {CURRENT_TIMESTAMP}" 91 | 92 | 93 | # Add body to the email message 94 | message.attach(MIMEText(body, "plain")) 95 | 96 | 97 | # Attach log files to email 98 | attach_log_files_to_email(message, semantic_layer_log_directory) 99 | 100 | 101 | 102 | # ===================================== SENDING EMAIL MESSAGE ===================================== 103 | 104 | def send_email(): 105 | with smtplib.SMTP(host=SMTP_HOST_SERVER, port=SMTP_PORT) as smtp: 106 | smtp.ehlo() 107 | smtp.starttls() 108 | smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD) 109 | smtp.send_message(message) 110 | print('Message sent successfully. 
') 111 | print() 112 | 113 | 114 | send_email() -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/__pycache__/L3_email_bot.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/__pycache__/L3_email_bot.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_accommodation_bookings_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_accommodation_bookings_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_customer_feedbacks_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_customer_feedbacks_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_customer_info_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_customer_info_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_flight_bookings_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_flight_bookings_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_flight_destinations_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_flight_destinations_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_flight_promotion_deals_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_flight_promotion_deals_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_flight_schedules_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_flight_schedules_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_flight_ticket_sales_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_flight_ticket_sales_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_sales_agents_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_sales_agents_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_ticket_prices_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/dev/__pycache__/dim_ticket_prices_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/prod/__pycache__/create_sem_prod_env.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/prod/__pycache__/create_sem_prod_env.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/prod/create_sem_prod_env.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import time 4 | import random 5 | import psycopg2 6 | import pandas as pd 7 | import configparser 8 | from pathlib import Path 9 | import logging, coloredlogs 10 | from datetime import datetime 11 | 12 | # ================================================ LOGGER ================================================ 13 | 14 | 15 | # Set up root root_logger 16 | root_logger = logging.getLogger(__name__) 17 | root_logger.setLevel(logging.DEBUG) 18 | 19 | 20 | # Set up formatter for logs 21 | file_handler_log_formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s ') 22 | console_handler_log_formatter = coloredlogs.ColoredFormatter(fmt = '%(message)s', level_styles=dict( 23 | debug = dict (color = 'white'), 24 | info = dict (color = 'green'), 25 | warning = dict (color = 'cyan'), 26 | error = dict (color = 'red', bold = True, bright = True), 27 | critical = dict (color = 'black', bold = True, background = 'red') 28 | ), 29 | 30 | field_styles=dict( 31 | messages = dict (color = 'white') 32 | ) 33 | ) 34 | 35 | 36 | # Set up file handler object for logging events to file 37 | current_filepath = Path(__file__).stem 38 | file_handler = logging.FileHandler('logs/L3_semantic_layer/prod/' + current_filepath + '.log', mode='w') 39 | file_handler.setFormatter(file_handler_log_formatter) 40 | 41 | 42 | # Set up console handler object for 
writing event logs to console in real time (i.e. streams events to stderr) 43 | console_handler = logging.StreamHandler() 44 | console_handler.setFormatter(console_handler_log_formatter) 45 | 46 | 47 | # Add the file handler 48 | root_logger.addHandler(file_handler) 49 | 50 | 51 | # Only add the console handler if the script is running directly from this location 52 | if __name__=="__main__": 53 | root_logger.addHandler(console_handler) 54 | 55 | 56 | 57 | 58 | # ================================================ CONFIG ================================================ 59 | config = configparser.ConfigParser() 60 | 61 | # Use the local config file from the local machine 62 | path = os.path.abspath('dwh_pipelines/local_config.ini') 63 | config.read(path) 64 | DATASETS_LOCATION_PATH = config['travel_data_filepath']['DATASETS_LOCATION_PATH'] 65 | 66 | host = config['travel_data_filepath']['HOST'] 67 | port = config['travel_data_filepath']['PORT'] 68 | database = config['travel_data_filepath']['SEMANTIC_DB'] 69 | username = config['travel_data_filepath']['USERNAME'] 70 | password = config['travel_data_filepath']['PASSWORD'] 71 | 72 | postgres_connection = None 73 | cursor = None 74 | 75 | 76 | 77 | # Connect to the staging instance of the Postgres data warehouse 78 | 79 | postgres_connection = psycopg2.connect( 80 | host = host, 81 | port = port, 82 | dbname = database, 83 | user = username, 84 | password = password, 85 | ) 86 | postgres_connection.set_session(autocommit=True) 87 | 88 | 89 | 90 | def create_prod_environment_for_semantic(): 91 | # Set up constants 92 | CURRENT_TIMESTAMP = datetime.now() 93 | dev_schema_name = 'dev' 94 | prod_schema_name = 'prod' 95 | active_db_name = database 96 | data_warehouse_layer = 'SEMANTIC' 97 | 98 | 99 | # Create a cursor object to execute the PG-SQL commands 100 | cursor = postgres_connection.cursor() 101 | 102 | 103 | 104 | # Validate the Postgres database connection 105 | if postgres_connection.closed == 0: 106 | root_logger.debug(f"") 107 | root_logger.info("=================================================================================") 108 | root_logger.info(f"CONNECTION SUCCESS: Managed to connect successfully to the {active_db_name} database!!") 109 | root_logger.info(f"Connection details: {postgres_connection.dsn} ") 110 | root_logger.info("=================================================================================") 111 | root_logger.debug("") 112 | 113 | elif postgres_connection.closed != 0: 114 | raise ConnectionError("CONNECTION ERROR: Unable to connect to the demo_company database...") 115 | 116 | 117 | 118 | # Set up SQL statements for schema creation and validation check 119 | try: 120 | 121 | create_schema = f''' CREATE SCHEMA IF NOT EXISTS {prod_schema_name}; 122 | ''' 123 | 124 | check_if_schema_exists = f''' SELECT schema_name from information_schema.schemata WHERE schema_name= '{prod_schema_name}'; 125 | ''' 126 | 127 | 128 | # Create schema in Postgres 129 | CREATING_SCHEMA_PROCESSING_START_TIME = time.time() 130 | cursor.execute(create_schema) 131 | root_logger.info("") 132 | root_logger.info(f"Successfully created '{prod_schema_name}' schema. 
") 133 | root_logger.info("") 134 | CREATING_SCHEMA_PROCESSING_END_TIME = time.time() 135 | 136 | 137 | CREATING_SCHEMA_VAL_CHECK_START_TIME = time.time() 138 | cursor.execute(check_if_schema_exists) 139 | CREATING_SCHEMA_VAL_CHECK_END_TIME = time.time() 140 | 141 | 142 | 143 | sql_result = cursor.fetchone()[0] 144 | if sql_result: 145 | root_logger.debug(f"") 146 | root_logger.info(f"=================================================================================================") 147 | root_logger.info(f"SCHEMA CREATION SUCCESS: Managed to create {prod_schema_name} schema in {active_db_name} ") 148 | root_logger.info(f"Schema name in Postgres: {sql_result} ") 149 | root_logger.info(f"SQL Query for validation check: {check_if_schema_exists} ") 150 | root_logger.info(f"=================================================================================================") 151 | root_logger.debug(f"") 152 | 153 | else: 154 | root_logger.debug(f"") 155 | root_logger.error(f"=================================================================================================") 156 | root_logger.error(f"SCHEMA CREATION FAILURE: Unable to create schema for {active_db_name}...") 157 | root_logger.info(f"SQL Query for validation check: {check_if_schema_exists} ") 158 | root_logger.error(f"=================================================================================================") 159 | root_logger.debug(f"") 160 | 161 | # postgres_connection.commit() 162 | 163 | except Exception as e: 164 | print(e) 165 | 166 | 167 | # Get all the tables from DEV environment 168 | try: 169 | root_logger.debug(f"") 170 | root_logger.debug(f"Now creating '{prod_schema_name}' environment ....") 171 | root_logger.debug(f"") 172 | sql_query = f""" SELECT table_name FROM information_schema.tables WHERE table_schema = '{dev_schema_name}' AND table_name LIKE '%dim%' 173 | """ 174 | cursor.execute(sql_query) 175 | 176 | sql_results = cursor.fetchall() 177 | no_of_sql_results = len(sql_results) 178 | root_logger.debug(f'No of results: {no_of_sql_results} ') 179 | 180 | 181 | for table in sql_results: 182 | table_name = table[0] 183 | root_logger.info(f"") 184 | root_logger.info(f"Now creating '{table_name}' table in production environment ...") 185 | # root_logger.info(f"") 186 | sql_query = f""" CREATE TABLE IF NOT EXISTS {prod_schema_name}.{table_name} as SELECT * FROM {dev_schema_name}.{table_name} 187 | """ 188 | cursor.execute(sql_query) 189 | # root_logger.info(f"") 190 | root_logger.info(f"Successfully created '{table_name}' table in production environment ") 191 | root_logger.info(f"") 192 | 193 | 194 | # postgres_connection.commit() 195 | root_logger.debug(f"") 196 | root_logger.debug(f"Successfully created '{prod_schema_name}' environment. 
") 197 | root_logger.debug(f"") 198 | 199 | 200 | 201 | except Exception as e: 202 | print(e) 203 | 204 | 205 | finally: 206 | 207 | # Close the cursor if it exists 208 | if cursor is not None: 209 | cursor.close() 210 | root_logger.debug("") 211 | root_logger.debug("Cursor closed successfully.") 212 | 213 | # Close the database connection to Postgres if it exists 214 | if postgres_connection is not None: 215 | postgres_connection.close() 216 | # root_logger.debug("") 217 | root_logger.debug("Session connected to Postgres database closed.") 218 | 219 | 220 | 221 | create_prod_environment_for_semantic() -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/semantic-to-dwh-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/semantic-to-dwh-diagram.png -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_accommodation_bookings_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_accommodation_bookings_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_customer_feedbacks_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_customer_feedbacks_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_customer_info_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_customer_info_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_flight_bookings_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_flight_bookings_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_flight_destinations_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_flight_destinations_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- 
/dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_flight_promotion_deals_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_flight_promotion_deals_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_flight_schedules_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_flight_schedules_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_flight_ticket_sales_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_flight_ticket_sales_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_sales_agents_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_sales_agents_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_ticket_prices_tbl.cpython-310-pytest-7.2.1.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L3_semantic_layer/tests/__pycache__/test_dim_ticket_prices_tbl.cpython-310-pytest-7.2.1.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L3_semantic_layer/tests/test_dim_flight_promotion_deals_tbl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import pytest 4 | import psycopg2 5 | import configparser 6 | 7 | 8 | 9 | 10 | # ================================================ CONFIG ================================================ 11 | 12 | # Add a flag/switch indicating whether Airflow is in use or not 13 | USING_AIRFLOW = False 14 | 15 | 16 | 17 | # Create a config file for storing environment variables 18 | config = configparser.ConfigParser() 19 | if USING_AIRFLOW: 20 | 21 | # Use the airflow config file from the airflow container 22 | config.read('/usr/local/airflow/dags/etl_to_postgres/airflow_config.ini') 23 | DATASETS_LOCATION_PATH = config['postgres_airflow_config']['DATASET_SOURCE_PATH'] 24 | 25 | host = config['postgres_airflow_config']['HOST'] 26 | port = config['postgres_airflow_config']['PORT'] 27 | database = config['postgres_airflow_config']['SEMANTIC_DB'] 28 | username = config['postgres_airflow_config']['USERNAME'] 29 | password = 
config['postgres_airflow_config']['PASSWORD'] 30 | 31 | postgres_connection = None 32 | cursor = None 33 | 34 | 35 | else: 36 | 37 | # Use the local config file from the local machine 38 | path = os.path.abspath('dwh_pipelines/local_config.ini') 39 | config.read(path) 40 | DATASETS_LOCATION_PATH = config['travel_data_filepath']['DATASETS_LOCATION_PATH'] 41 | 42 | host = config['travel_data_filepath']['HOST'] 43 | port = config['travel_data_filepath']['PORT'] 44 | database = config['travel_data_filepath']['SEMANTIC_DB'] 45 | username = config['travel_data_filepath']['USERNAME'] 46 | password = config['travel_data_filepath']['PASSWORD'] 47 | 48 | postgres_connection = None 49 | cursor = None 50 | 51 | 52 | 53 | # Connect to the Postgres database 54 | try: 55 | postgres_connection = psycopg2.connect( 56 | host = host, 57 | port = port, 58 | dbname = database, 59 | user = username, 60 | password = password, 61 | ) 62 | 63 | 64 | # Create a cursor object to execute the PG-SQL commands 65 | cursor = postgres_connection.cursor() 66 | 67 | 68 | except psycopg2.Error: 69 | raise ConnectionError("CONNECTION ERROR: Unable to connect to the demo_company database...") 70 | 71 | 72 | 73 | # Define the database, schema and table names 74 | 75 | table_name = 'dim_flight_promotion_deals_tbl' 76 | schema_name = 'dev' 77 | database_name = database 78 | 79 | 80 | 81 | # ====================================== DATA QUALITY CHECKS ====================================== 82 | # ================================================================================================= 83 | 84 | 85 | 86 | # ====================================== TEST 1: DATABASE CONNECTION CHECK ====================================== 87 | 88 | 89 | """ Test whether the connection to the Postgres database is successful """ 90 | 91 | def test_database_connection(): 92 | 93 | # Assert the existence of a valid connection to the database (i.e. not None) 94 | assert postgres_connection is not None, f"CONNECTION ERROR: Unable to connect to the {database_name} database... " 95 | 96 | 97 | 98 | 99 | 100 | 101 | # ====================================== TEST 2: SCHEMA EXISTENCE CHECK ====================================== 102 | 103 | 104 | """ Verify the semantic schema exists in the Postgres semantic database """ 105 | 106 | 107 | 108 | def test_schema_existence(): 109 | sql_query = f""" SELECT schema_name FROM information_schema.schemata 110 | """ 111 | cursor.execute(sql_query) 112 | 113 | sql_results = cursor.fetchall() 114 | schemas = [schema[0] for schema in sql_results] 115 | 116 | assert schema_name in schemas, f"The '{schema_name}' schema should be found in the '{database_name}' database. 
" 117 | 118 | 119 | 120 | 121 | 122 | 123 | # ====================================== TEST 3: COLUMNS EXISTENCE CHECK ====================================== 124 | 125 | """ Verify the columns of this table exist in the Postgres semantic database """ 126 | 127 | 128 | 129 | def test_columns_existence(): 130 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name='{table_name}' 131 | """ 132 | cursor.execute(sql_query) 133 | 134 | sql_results = cursor.fetchall() 135 | actual_columns = [column[0] for column in sql_results] 136 | 137 | expected_columns = ['promotion_id', 138 | 'promotion_name', 139 | 'flight_booking_id', 140 | 'applied_discount', 141 | 'created_at', 142 | 'updated_at', 143 | 'source_system', 144 | 'source_file', 145 | 'load_timestamp', 146 | 'dwh_layer' 147 | ] 148 | 149 | for expected_column in expected_columns: 150 | assert expected_column in actual_columns, f"The '{expected_column}' column should be in the '{table_name}' table. " 151 | 152 | 153 | 154 | 155 | 156 | # ====================================== TEST 4: TABLE EXISTENCE CHECK ====================================== 157 | 158 | 159 | """ Check if the active table is in the Postgres semantic database """ 160 | 161 | 162 | def test_table_existence(): 163 | sql_query = f""" SELECT * FROM information_schema.tables WHERE table_name = '{table_name}' AND table_schema = '{schema_name}' ; """ 164 | cursor.execute(sql_query) 165 | sql_result = cursor.fetchone() 166 | 167 | assert sql_result is not None, f"The '{table_name}' table does not exist in the '{database_name}.{schema_name}' schema. " 168 | 169 | 170 | 171 | 172 | 173 | # ====================================== TEST 5: DATA TYPES CHECK ====================================== 174 | 175 | 176 | """ Test if each column is mapped to the expected data type in Postgres """ 177 | 178 | 179 | def test_column_data_types(): 180 | 181 | # Create a dictionary that specifies the expected data types for each column 182 | expected_data_types = { 183 | 'flight_booking_sk' : "integer", 184 | 'flight_promotion_deal_sk' : "integer", 185 | 'promotion_id' : "uuid", 186 | 'promotion_name' : "character varying", 187 | 'flight_booking_id' : "uuid", 188 | 'applied_discount' : "numeric", 189 | "created_at" : "timestamp with time zone", 190 | "updated_at" : "timestamp with time zone", 191 | "source_system" : "character varying", 192 | "source_file" : "character varying", 193 | "load_timestamp" : "timestamp without time zone", 194 | "dwh_layer" : "character varying" 195 | 196 | } 197 | 198 | 199 | 200 | # Use SQL to extract the column names and their data types 201 | sql_query = f""" SELECT column_name, data_type from information_schema.columns WHERE table_name = '{table_name}' 202 | """ 203 | cursor.execute(sql_query) 204 | 205 | sql_results = cursor.fetchall() 206 | 207 | for column_name, actual_data_type in sql_results: 208 | assert actual_data_type.lower() == expected_data_types[column_name], f"The expected data type for column '{column_name}' was '{expected_data_types[column_name]}', but the actual data type was '{actual_data_type}'. 
" 208 | 209 | 210 | 211 | 212 | 213 | 214 | # ====================================== TEST 6: EMPTY VALUES CHECK ====================================== 215 | 216 | 217 | """ Check if there are any empty values present in your table """ 218 | 219 | def test_empty_values_in_table(): 220 | sql_query = f""" SELECT * FROM {schema_name}.{table_name} 221 | """ 222 | cursor.execute(sql_query) 223 | sql_results = cursor.fetchall() 224 | 225 | row_no = 0 226 | for record in sql_results: 227 | row_no += 1 228 | for cell_value in record: 229 | assert cell_value is not None, f" There is an empty value in the '{schema_name}.{table_name}' table on row '{row_no}' . " 230 | 231 | 232 | 233 | 234 | 235 | 236 | # ====================================== TEST 7: NULL VALUES CHECK ====================================== 237 | 238 | """ Check if there are any NULL values present in your table """ 239 | 240 | def test_null_values_in_table(): 241 | 242 | # Get list of columns from table 243 | cursor.execute(f""" SELECT column_name from information_schema.columns WHERE table_name = '{table_name}' ; 244 | """) 245 | columns = cursor.fetchall() 246 | 247 | 248 | for column in columns: 249 | sql_query = f'SELECT COUNT(*) FROM {schema_name}.{table_name} WHERE {column[0]} is NULL' 250 | cursor.execute(sql_query) 251 | sql_result = cursor.fetchone() 252 | 253 | assert sql_result[0] == 0, f"The '{column[0]}' column has NULL values. " 254 | 255 | 256 | 257 | 258 | 259 | 260 | 261 | # ====================================== TEST 8: ID CHARACTER LENGTH CONSTRAINT CHECK ====================================== 262 | 263 | """ Test all the ID columns in the table contain 36 characters in length """ 264 | 265 | def test_id_char_length_constraint(): 266 | expected_id_char_length = 36 267 | 268 | 269 | 270 | sql_query = f""" SELECT column_name FROM information_schema.columns WHERE table_name = '{table_name}' AND column_name LIKE '%_id%' 271 | """ 272 | cursor.execute(sql_query) 273 | 274 | id_columns = [sql_result[0] for sql_result in cursor.fetchall()] 275 | 276 | 277 | # Assert the number of characters for each value in the ID columns is equal to 36 278 | for id_column in id_columns: 279 | cursor.execute(f""" SELECT COUNT(*) FROM {schema_name}.{table_name} WHERE LENGTH({id_column}::TEXT) != {expected_id_char_length} ; """) 280 | total_no_of_invalid_ids = cursor.fetchone()[0] 281 | assert total_no_of_invalid_ids == 0, f"Invalid IDs found: all IDs must be {expected_id_char_length} characters long. The column containing the invalid IDs is the '{id_column}' column. " 282 | 283 | 284 | 285 | 286 | # ====================================== TEST 9: DUPLICATES CHECK ====================================== 287 | 288 | 289 | """ Test the number of duplicate records appearing in the Postgres table """ 290 | 291 | def test_duplicate_records_count(): 292 | column_name = "promotion_id" 293 | sql_query = f""" SELECT {column_name}, 294 | COUNT (*) 295 | FROM {schema_name}.{table_name} 296 | GROUP BY {column_name} 297 | HAVING COUNT(*) > 1 298 | ; 299 | """ 300 | cursor.execute(sql_query) 301 | 302 | duplicates = cursor.fetchall() 303 | total_no_of_duplicates = len(duplicates) 304 | 305 | # Assert that there are no duplicate records for the specified column 306 | assert total_no_of_duplicates == 0, f"Duplicate entries detected - {table_name} should contain no duplicate entries." 
307 | 308 | 309 | 310 | 311 | def run_tests(): 312 | test_filepath = os.path.abspath('dwh_pipelines/L3_semantic_layer/tests/test_dim_flight_promotion_deals_tbl.py') 313 | test_result = pytest.main([test_filepath]) 314 | return test_result 315 | 316 | 317 | 318 | if __name__ == "__main__": 319 | # Run DQ tests 320 | test_result = run_tests() 321 | 322 | # Create DQ reports in HTML format 323 | from pathlib import Path 324 | import webbrowser 325 | file_path = os.path.abspath(__file__) 326 | current_filepath = Path(__file__).stem 327 | html_report_path = f"{current_filepath}.html" 328 | pytest.main(["-v", "-s", "--capture=tee-sys", file_path, f"--html={html_report_path}", "--self-contained-html"]) 329 | 330 | # Open DQ reports in browser 331 | # dq_report_url = Path.cwd() / html_report_path 332 | # webbrowser.open(dq_report_url.as_uri()) 333 | sys.exit() 334 | -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/L4_email_bot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import smtplib 3 | from datetime import datetime 4 | from email.mime.multipart import MIMEMultipart 5 | from email.mime.base import MIMEBase 6 | from email.mime.text import MIMEText 7 | from email.utils import COMMASPACE 8 | from email import encoders 9 | from pathlib import Path 10 | from dotenv import load_dotenv 11 | 12 | 13 | # Load environment variables from .env 14 | load_dotenv() 15 | 16 | 17 | 18 | # Set up constants 19 | current_filepath = Path(__file__).stem 20 | 21 | SMTP_PORT = 587 22 | SMTP_HOST_SERVER = "smtp.gmail.com" 23 | CURRENT_TIMESTAMP = datetime.now().strftime("%Y-%m-%d %H:%M:%S") 24 | EMAIL_ADDRESS = os.getenv("SENDER") 25 | EMAIL_PASSWORD = os.getenv("EMAIL_PASSWORD") 26 | SENDER = "Postgres Data Warehouse Program - SDW" 27 | RECIPIENT = os.getenv("RECIPIENT") 28 | 29 | 30 | L0_LOG_DIRECTORY = os.getenv("L0_LOG_DIRECTORY") 31 | L1_LOG_DIRECTORY = os.getenv("L1_LOG_DIRECTORY") 32 | L2_LOG_DIRECTORY = os.getenv("L2_LOG_DIRECTORY") 33 | L3_LOG_DIRECTORY = os.getenv("L3_LOG_DIRECTORY") 34 | L4_LOG_DIRECTORY = os.getenv("L4_LOG_DIRECTORY") 35 | 36 | body_main_subject = "creating aggregate tables from the MDM tables of the Postgres data warehouse" 37 | body = f"""Hi Stephen, 38 | 39 | See attached the logs for {body_main_subject}. 
40 | 41 | Regards, 42 | {SENDER} 43 | 44 | """ 45 | 46 | 47 | # Create function for getting the directory paths for log files 48 | def get_log_filepaths(log_directory): 49 | log_filepaths = [] 50 | for root, directories, log_files in os.walk(log_directory): 51 | for filename in log_files: 52 | log_filepath = os.path.join(root, filename) 53 | log_filepaths.append(log_filepath) 54 | return log_filepaths 55 | 56 | 57 | # Create function for attaching log files to email 58 | def attach_log_files_to_email(message, log_filepaths): 59 | for log_file in log_filepaths: 60 | with open(log_file, 'rb') as file: 61 | log_attachment = MIMEBase('application', 'octet-stream') 62 | log_attachment.set_payload(file.read()) 63 | encoders.encode_base64(log_attachment) 64 | log_attachment.add_header('Content-Disposition', f'attachment; filename="{os.path.basename(log_file)}"') 65 | message.attach(log_attachment) 66 | 67 | 68 | 69 | 70 | # ===================================== SETTING UP LOG FILE ATTACHMENTS ===================================== 71 | 72 | 73 | # Get directory paths for log files 74 | dwh_layer_log_directory = get_log_filepaths(L4_LOG_DIRECTORY) 75 | 76 | log_file_counter = 0 77 | for log_file in dwh_layer_log_directory: 78 | log_file_counter += 1 79 | print('') 80 | print(f'Log file {log_file_counter}: {log_file} ') 81 | 82 | 83 | 84 | # ===================================== SETTING UP EMAIL MESSAGE ===================================== 85 | 86 | # Set up constants for email 87 | message = MIMEMultipart() 88 | message["From"] = SENDER 89 | message["To"] = RECIPIENT 90 | message["Subject"] = f"L4 - DWH Layer Log Files - {CURRENT_TIMESTAMP}" 91 | 92 | 93 | # Add body to the email message 94 | message.attach(MIMEText(body, "plain")) 95 | 96 | 97 | # Attach log files to email 98 | attach_log_files_to_email(message, dwh_layer_log_directory) 99 | 100 | 101 | 102 | # ===================================== SENDING EMAIL MESSAGE ===================================== 103 | 104 | def send_email(): 105 | with smtplib.SMTP(host=SMTP_HOST_SERVER, port=SMTP_PORT) as smtp: 106 | smtp.ehlo() 107 | smtp.starttls() 108 | smtp.login(EMAIL_ADDRESS, EMAIL_PASSWORD) 109 | smtp.send_message(message) 110 | print('Message sent successfully. 
') 111 | print() 112 | 113 | 114 | send_email() 115 | -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/__pycache__/L4_email_bot.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/__pycache__/L4_email_bot.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/ERD - 1st level.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/ERD - 1st level.png -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/ERD - 2nd level.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/ERD - 2nd level.png -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_customers_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_customers_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_date_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_date_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_dates_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_dates_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_destinations_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_destinations_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_flights_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_flights_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_prices_tbl.cpython-310.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_prices_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_promotions_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_promotions_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_sales_employees_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_sales_employees_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_schedules_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/dim_schedules_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/fact_accommodations_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/fact_accommodations_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/fact_sales_tbl.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/datamarts/__pycache__/fact_sales_tbl.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/dwh-diagram.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/dwh-diagram.png -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/reporting_channel/__pycache__/app.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/reporting_channel/__pycache__/app.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/reporting_channel/app.py: -------------------------------------------------------------------------------- 1 | import os 2 | import dash 3 | import psycopg2 4 | import configparser 5 | import pandas as pd 6 | from dash import dcc 7 | from dash import 
html 8 | import dash_bootstrap_components as dbc 9 | from pathlib import Path 10 | import logging, coloredlogs 11 | import plotly.express as px 12 | from sqlalchemy import create_engine 13 | 14 | 15 | # ================================================ LOGGER ================================================ 16 | 17 | 18 | # Set up root root_logger 19 | root_logger = logging.getLogger(__name__) 20 | root_logger.setLevel(logging.DEBUG) 21 | 22 | 23 | # Set up formatter for logs 24 | file_handler_log_formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s ') 25 | console_handler_log_formatter = coloredlogs.ColoredFormatter(fmt = '%(message)s', level_styles=dict( 26 | debug = dict (color = 'white'), 27 | info = dict (color = 'green'), 28 | warning = dict (color = 'cyan'), 29 | error = dict (color = 'red', bold = True, bright = True), 30 | critical = dict (color = 'black', bold = True, background = 'red') 31 | ), 32 | 33 | field_styles=dict( 34 | messages = dict (color = 'white') 35 | ) 36 | ) 37 | 38 | 39 | # Set up file handler object for logging events to file 40 | current_filepath = Path(__file__).stem 41 | file_handler = logging.FileHandler('logs/L4_dwh_layer/user_access_layer/' + current_filepath + '.log', mode='w') 42 | file_handler.setFormatter(file_handler_log_formatter) 43 | 44 | 45 | # Set up console handler object for writing event logs to console in real time (i.e. streams events to stderr) 46 | console_handler = logging.StreamHandler() 47 | console_handler.setFormatter(console_handler_log_formatter) 48 | 49 | 50 | # Add the file handler 51 | root_logger.addHandler(file_handler) 52 | 53 | 54 | # Only add the console handler if the script is running directly from this location 55 | if __name__=="__main__": 56 | root_logger.addHandler(console_handler) 57 | 58 | 59 | 60 | 61 | # ================================================ CONFIG ================================================ 62 | 63 | # Add a flag/switch indicating whether Airflow is in use or not 64 | USING_AIRFLOW = False 65 | 66 | 67 | 68 | # Create a config file for storing environment variables 69 | config = configparser.ConfigParser() 70 | if USING_AIRFLOW: 71 | 72 | # Use the airflow config file from the airflow container 73 | config.read('/usr/local/airflow/dags/etl_to_postgres/airflow_config.ini') 74 | DATASETS_LOCATION_PATH = config['postgres_airflow_config']['DATASET_SOURCE_PATH'] 75 | 76 | host = config['postgres_airflow_config']['HOST'] 77 | port = config['postgres_airflow_config']['PORT'] 78 | database = config['postgres_airflow_config']['DWH_DB'] 79 | username = config['postgres_airflow_config']['USERNAME'] 80 | password = config['postgres_airflow_config']['PASSWORD'] 81 | 82 | postgres_connection = None 83 | cursor = None 84 | 85 | 86 | else: 87 | 88 | # Use the local config file from the local machine 89 | path = os.path.abspath('dwh_pipelines/local_config.ini') 90 | config.read(path) 91 | DATASETS_LOCATION_PATH = config['travel_data_filepath']['DATASETS_LOCATION_PATH'] 92 | 93 | host = config['travel_data_filepath']['HOST'] 94 | port = config['travel_data_filepath']['PORT'] 95 | database = config['travel_data_filepath']['DWH_DB'] 96 | username = config['travel_data_filepath']['USERNAME'] 97 | password = config['travel_data_filepath']['PASSWORD'] 98 | 99 | postgres_connection = None 100 | cursor = None 101 | 102 | 103 | # Begin the data extraction process 104 | root_logger.info("") 105 | root_logger.info("---------------------------------------------") 106 | root_logger.info("Beginning the dwh 
process...") 107 | 108 | 109 | postgres_connection = psycopg2.connect( 110 | host = host, 111 | port = port, 112 | dbname = database, 113 | user = username, 114 | password = password, 115 | ) 116 | postgres_connection.set_session(autocommit=True) 117 | 118 | 119 | def render_dash_visualizations(postgres_connection): 120 | try: 121 | 122 | # Set up constants 123 | 124 | active_schema_name = 'reporting' 125 | active_db_name = database 126 | sql_query_1 = f'''SELECT * FROM {active_schema_name}.avg_ticket_prices_by_year ; ''' 127 | sql_query_2 = f'''SELECT * FROM {active_schema_name}.flight_bookings_by_age ; ''' 128 | sql_query_3 = f'''SELECT * FROM {active_schema_name}.top_destinations ; ''' 129 | sql_query_4 = f'''SELECT * FROM {active_schema_name}.total_sales_by_destination ; ''' 130 | sql_query_5 = f'''SELECT * FROM {active_schema_name}.customer_booking_trend ; ''' 131 | sql_query_6 = f'''SELECT * FROM {active_schema_name}.total_sales_by_payment_method ; ''' 132 | sql_query_7 = f'''SELECT * FROM {active_schema_name}.total_sales_by_year ; ''' 133 | sql_alchemy_engine = create_engine(f'postgresql://{username}:{password}@{host}:{port}/{database}') 134 | data_warehouse_layer = 'DWH - UAL' 135 | 136 | 137 | # Validate the Postgres database connection 138 | if postgres_connection.closed == 0: 139 | root_logger.debug(f"") 140 | root_logger.info("=================================================================================") 141 | root_logger.info(f"CONNECTION SUCCESS: Managed to connect successfully to the {active_db_name} database!!") 142 | root_logger.info(f"Connection details: {postgres_connection.dsn} ") 143 | root_logger.info("=================================================================================") 144 | root_logger.debug("") 145 | 146 | elif postgres_connection.closed != 0: 147 | raise ConnectionError("CONNECTION ERROR: Unable to connect to the demo_company database...") 148 | 149 | 150 | 151 | # ================================================== CREATE DASHBOARD VIA PLOTLY-DASH ======================================= 152 | 153 | 154 | avg_ticket_prices_by_year_df = pd.read_sql(sql_query_1, con=sql_alchemy_engine) 155 | flight_bookings_by_age_df = pd.read_sql(sql_query_2, con=sql_alchemy_engine) 156 | top_destinations_df = pd.read_sql(sql_query_3, con=sql_alchemy_engine) 157 | total_sales_by_destination_df = pd.read_sql(sql_query_4, con=sql_alchemy_engine) 158 | 159 | # Commit the changes made in Postgres 160 | # postgres_connection.commit() 161 | 162 | 163 | # Create Dash app 164 | app = dash.Dash(external_stylesheets=[dbc.themes.BOOTSTRAP]) 165 | 166 | 167 | # Create graphs for dashboard 168 | graph_1 = dcc.Graph( 169 | figure=px.scatter(avg_ticket_prices_by_year_df, x="booking_year", y="arrival_city", size="avg_ticket_price", color="arrival_city", title="Average Ticket Prices by Destination and Year") 170 | ) 171 | 172 | graph_2 = dcc.Graph( 173 | figure=px.bar(flight_bookings_by_age_df, x="age", y="no_of_bookings") 174 | ) 175 | 176 | graph_3 = dcc.Graph( 177 | figure=px.treemap(top_destinations_df, path=['destination'], values="no_of_bookings", color="no_of_bookings", color_continuous_scale="Blues") 178 | ) 179 | 180 | graph_4 = dcc.Graph( 181 | figure=px.bar(top_destinations_df, x="destination", y="no_of_bookings", title="Top 10 Most Booked Destinations") 182 | ) 183 | 184 | graph_5 = dcc.Graph( 185 | figure=px.scatter(total_sales_by_destination_df, x="booking_year", y="arrival_city", size="total_sales", color="arrival_city", title="Total Sales by Destination and Year") 
186 | ) 187 | 188 | graph_6 = dcc.Graph( 189 | figure=px.bar(top_destinations_df, x="destination", y="no_of_bookings") 190 | ) 191 | 192 | # Create the layout for the Dash app 193 | app.layout = html.Div( 194 | [ 195 | 196 | dbc.Row( 197 | dbc.Col( 198 | html.H1("Flight Booking Data") 199 | ) 200 | ), 201 | dbc.Row( 202 | dbc.Col( 203 | html.H1("") 204 | ) 205 | ), 206 | 207 | dbc.Row( 208 | dbc.Col( 209 | html.H1("") 210 | ) 211 | ), 212 | dbc.Row( 213 | [ 214 | dbc.Col( 215 | [html.H2("Number of Bookings by Age"), 216 | graph_2], 217 | ), 218 | dbc.Col( 219 | [html.H2("Top Destinations"), 220 | graph_3], 221 | ), 222 | ] 223 | ), 224 | 225 | dbc.Row( 226 | dbc.Col( 227 | [html.H2("Top 10 Most Booked Destinations"), 228 | graph_6] 229 | ), 230 | ), 231 | dbc.Row([ 232 | dbc.Col( 233 | html.Div(""), align="start" 234 | ), 235 | 236 | dbc.Col( 237 | html.Div(""), align="middle" 238 | ), 239 | 240 | dbc.Col( 241 | html.H2("Made by SDW 🚀"), align="end" 242 | )] 243 | 244 | ), 245 | 246 | ] 247 | ) 248 | 249 | 250 | root_logger.info(f'') 251 | root_logger.info('================================================') 252 | 253 | 254 | # Run the app 255 | app.run_server(debug=True) 256 | root_logger.info("Now rendering Dash app....") 257 | 258 | 259 | except psycopg2.Error as e: 260 | root_logger.info(e) 261 | 262 | 263 | render_dash_visualizations(postgres_connection) 264 | 265 | -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/reporting_channel/dash-plotly-travel-dashboard_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/reporting_channel/dash-plotly-travel-dashboard_1.png -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/reporting_channel/dash-plotly-travel-dashboard_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/reporting_channel/dash-plotly-travel-dashboard_2.png -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/reporting_channel/dash-plotly-travel-dashboard_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/reporting_channel/dash-plotly-travel-dashboard_3.png -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/avg_ticket_prices_by_year.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/avg_ticket_prices_by_year.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/customer_booking_trend.cpython-310.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/customer_booking_trend.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/flight_bookings_by_age.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/flight_bookings_by_age.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/ticket_sales_by_age.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/ticket_sales_by_age.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/top_destinations.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/top_destinations.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/total_sales_by_destination.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/total_sales_by_destination.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/total_sales_by_payment_method.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/total_sales_by_payment_method.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/total_sales_by_year.cpython-310.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sdw-online/postgres-dwh/c949c6427c0871f775c4264424c1fc4c3c89e897/dwh_pipelines/L4_dwh_layer/user_access_layer/__pycache__/total_sales_by_year.cpython-310.pyc -------------------------------------------------------------------------------- /dwh_pipelines/dwh_approach.md: -------------------------------------------------------------------------------- 1 | # Approach 2 | 3 | 4 | 5 | # Objective 6 | 7 | To create a centralized platform for the analytics team to analyze customer and flight information. This platform aims to provide insights that will support the team in making informed decisions for enhancing the customer's travel experience. This means the analytics team will require access to a wealth of data from one location (i.e. 
the data warehouse), allowing them to make data-driven decisions that benefit the company and lead to a more efficient process that improves customer satisfaction. 8 | 9 | 10 | ## Source data 11 | 12 | Here are some of the tables gathered from the travel source systems (databases, CRMs and other tools): 13 | 14 | - Customer information ------------ [x] 15 | - Flight schedules ------------ [x] 16 | - Ticket price data ------------ [x] 17 | - Flight bookings data ------------ [x] 18 | - Customer feedback ------------ [x] 19 | - Raw customer demographic data ------------ [] 20 | - Flight destination information ------------ [x] 21 | - Flight ticket sales ------------ [x] 22 | - Flight Promotions & Deals ------------ [x] 23 | - Holiday data ------------ [] 24 | - Airline data ------------ [] 25 | - Sales agent data ------------ [x] 26 | - Flight destination revenue ------------ [] 27 | - Accommodation bookings data ------------ [x] 28 | 29 | 30 | ## Layers 31 | 32 | Here are the different layers that make up the proposed data warehouse solution in Postgres: 33 | 34 | * Raw layer - for storing source data in its original state 35 | * Staging layer - for cleaning and framing raw data in a suitable format for pre-computing 36 | * Semantic layer - for pre-computing staged data with business logic to create a single version of truth 37 | * Data warehouse layer - for displaying the single version of truth in a unified manner to the downstream users 38 | * Governance layer - for establishing processes, practices and policies for managing the DWH's data 39 | * Orchestration layer - for scheduling and managing pipeline tasks and their dependencies 40 | 41 | 42 | 43 | 44 | ## Raw layer 45 | 46 | ### Macro tasks 47 | 48 | - Load source tables into raw tables 49 | - Highlight sensitive fields 50 | - Add event logging 51 | - Run data profiling checks 52 | 53 | 54 | 55 | 56 | 57 | 58 | *** 59 | 60 | ## Staging layer 61 | 62 | ### Macro tasks 63 | 64 | - Load raw data into staging tables 65 | - Design transformation strategy 66 | - Design DQ constraints and QA tests 67 | - Execute transformation strategy 68 | - Execute DQ tests 69 | - Create DEV and PROD environments (schemas) 70 | 71 | 72 | 73 | 74 | *** 75 | 76 | ## Semantic layer 77 | 78 | ### Macro tasks 79 | - Load staging to semantic tables 80 | - Add surrogate keys to semantic tables 81 | - Add business rules to semantic tables 82 | - Add date dimension table 83 | - Add the date foreign keys to the relevant dim tables 84 | - Define cardinality between tables (via ERD if possible) 85 | - Create data dictionary for the tables 86 | - Create DEV and PROD environments (schemas) 87 | 88 | 89 | 90 | 91 | *** 92 | 93 | ## Data warehouse layer 94 | 95 | ### Macro tasks 96 | - Create fact and dimension tables (dimensional modelling) 97 | - Create aggregated views using the fact and dimension tables 98 | - Document the code in each layer to reduce single point of failure risk 99 | - Conduct regular maintenance activities e.g. 
performance tuning, backups, system updates 100 | 101 | 102 | 103 | 104 | *** 105 | 106 | ## Governance layer 107 | 108 | ### Macro tasks 109 | - Understand the members of the analytics team that require access to the DWH 110 | - Create custom roles using a role-based access control 111 | - Grant table ownership rights to the delegated roles 112 | - Grant the appropriate schema usage rights to each custom role 113 | - Grant privileges to each custom role around each member's responsibilities in the analytics team 114 | - Create row security policies to control what data is visible to each role within their authorized tables 115 | 116 | 117 | 118 | 119 | *** 120 | 121 | ## Orchestration layer 122 | 123 | ### Macro tasks 124 | 125 | 126 | - Set up workflows and task dependencies in Prefect 127 | - Setup CI/CD pipelines in GitHub Actions to automatically test and deploy changes made to the DWH 128 | 129 | 130 | 131 | 132 | *** 133 | -------------------------------------------------------------------------------- /dwh_pipelines/performance_tuning/dwh_indexes.py: -------------------------------------------------------------------------------- 1 | import os 2 | import psycopg2 3 | import configparser 4 | from pathlib import Path 5 | import logging, coloredlogs 6 | 7 | 8 | # ================================================ LOGGER ================================================ 9 | 10 | 11 | # Set up root root_logger 12 | root_logger = logging.getLogger(__name__) 13 | root_logger.setLevel(logging.DEBUG) 14 | 15 | 16 | # Set up formatter for logs 17 | file_handler_log_formatter = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s ') 18 | console_handler_log_formatter = coloredlogs.ColoredFormatter(fmt = '%(message)s', level_styles=dict( 19 | debug = dict (color = 'white'), 20 | info = dict (color = 'green'), 21 | warning = dict (color = 'cyan'), 22 | error = dict (color = 'red', bold = True, bright = True), 23 | critical = dict (color = 'black', bold = True, background = 'red') 24 | ), 25 | 26 | field_styles=dict( 27 | messages = dict (color = 'white') 28 | ) 29 | ) 30 | 31 | 32 | # Set up file handler object for logging events to file 33 | current_filepath = Path(__file__).stem 34 | file_handler = logging.FileHandler('logs/governance/' + current_filepath + '.log', mode='w') 35 | file_handler.setFormatter(file_handler_log_formatter) 36 | 37 | 38 | # Set up console handler object for writing event logs to console in real time (i.e. 
streams events to stderr) 39 | console_handler = logging.StreamHandler() 40 | console_handler.setFormatter(console_handler_log_formatter) 41 | 42 | 43 | # Add the file handler 44 | root_logger.addHandler(file_handler) 45 | 46 | 47 | # Only add the console handler if the script is running directly from this location 48 | if __name__=="__main__": 49 | root_logger.addHandler(console_handler) 50 | 51 | 52 | 53 | 54 | # ================================================ CONFIG ================================================ 55 | 56 | # Add a flag/switch indicating whether Airflow is in use or not 57 | USING_AIRFLOW = False 58 | 59 | 60 | 61 | # Create a config file for storing environment variables 62 | config = configparser.ConfigParser() 63 | if USING_AIRFLOW: 64 | 65 | # Use the airflow config file from the airflow container 66 | config.read('/usr/local/airflow/dags/etl_to_postgres/airflow_config.ini') 67 | DATASETS_LOCATION_PATH = config['postgres_airflow_config']['DATASET_SOURCE_PATH'] 68 | 69 | host = config['postgres_airflow_config']['HOST'] 70 | port = config['postgres_airflow_config']['PORT'] 71 | database = config['postgres_airflow_config']['DWH_DB'] 72 | username = config['postgres_airflow_config']['USERNAME'] 73 | password = config['postgres_airflow_config']['PASSWORD'] 74 | 75 | postgres_connection = None 76 | cursor = None 77 | 78 | 79 | else: 80 | 81 | # Use the local config file from the local machine 82 | path = os.path.abspath('dwh_pipelines/local_config.ini') 83 | config.read(path) 84 | DATASETS_LOCATION_PATH = config['travel_data_filepath']['DATASETS_LOCATION_PATH'] 85 | 86 | host = config['travel_data_filepath']['HOST'] 87 | port = config['travel_data_filepath']['PORT'] 88 | database = config['travel_data_filepath']['DWH_DB'] 89 | username = config['travel_data_filepath']['USERNAME'] 90 | password = config['travel_data_filepath']['PASSWORD'] 91 | 92 | postgres_connection = None 93 | cursor = None 94 | 95 | 96 | # Begin the data extraction process 97 | root_logger.info("") 98 | root_logger.info("---------------------------------------------") 99 | root_logger.info("Beginning the dwh process...") 100 | 101 | 102 | postgres_connection = psycopg2.connect( 103 | host = host, 104 | port = port, 105 | dbname = database, 106 | user = username, 107 | password = password, 108 | ) 109 | postgres_connection.set_session(autocommit=True) 110 | 111 | 112 | def set_up_access_controls(postgres_connection): 113 | try: 114 | 115 | # Set up constants 116 | 117 | cursor = postgres_connection.cursor() 118 | active_db_name = database 119 | raw_db = config['travel_data_filepath']['RAW_DB'] 120 | staging_db = config['travel_data_filepath']['STAGING_DB'] 121 | semantic_db = config['travel_data_filepath']['SEMANTIC_DB'] 122 | dwh_db = config['travel_data_filepath']['DWH_DB'] 123 | custom_roles = ['junior_data_analyst', 124 | 'senior_data_analyst', 125 | 'junior_data_engineer', 126 | 'senior_data_engineer', 127 | 'junior_data_scientist', 128 | 'senior_data_scientist' 129 | ] 130 | 131 | raw_main_schema = 'main' 132 | 133 | staging_dev_schema = 'dev' 134 | staging_prod_schema = 'prod' 135 | 136 | semantic_dev_schema = 'dev' 137 | semantic_prod_schema = 'prod' 138 | 139 | dwh_reporting_schema = 'reporting' 140 | dwh_live_schema = 'live' 141 | 142 | 143 | # For creating indexes 144 | table_1 = 'dim_destinations_tbl' 145 | table_2 = 'dim_flights_tbl' 146 | 147 | index_1 = 'idx_arrival_city' 148 | index_2 = 'idx_departure_city' 149 | index_3 = 'idx_ticket_price' 150 | 151 | column_1 = 'arrival_city' 152 | 
column_2 = 'departure_city' 153 | column_3 = 'ticket_price' 154 | 155 | 156 | create_index_sql_query_1 = f''' CREATE INDEX {index_1} ON {dwh_live_schema}.{table_1}({column_1}); 157 | ''' 158 | create_index_sql_query_2 = f''' CREATE INDEX {index_2} ON {dwh_live_schema}.{table_1}({column_2}); 159 | ''' 160 | create_index_sql_query_3 = f''' CREATE INDEX {index_3} ON {dwh_live_schema}.{table_2}({column_3}); 161 | ''' 162 | 163 | 164 | 165 | # Validate the Postgres database connection 166 | if postgres_connection.closed == 0: 167 | root_logger.debug(f"") 168 | root_logger.info("=================================================================================") 169 | root_logger.info(f"CONNECTION SUCCESS: Managed to connect successfully to the {active_db_name} database!!") 170 | root_logger.info(f"Connection details: {postgres_connection.dsn} ") 171 | root_logger.info("=================================================================================") 172 | root_logger.debug("") 173 | 174 | elif postgres_connection.closed != 0: 175 | raise ConnectionError("CONNECTION ERROR: Unable to connect to the demo_company database...") 176 | 177 | 178 | 179 | 180 | 181 | # ================================================== CREATE INDEXES ======================================= 182 | 183 | try: 184 | root_logger.info(f'=========================================== CREATE INDEXES =======================================') 185 | root_logger.info(f'======================================================================================================') 186 | root_logger.info(f'') 187 | root_logger.info(f'') 188 | 189 | cursor.execute(create_index_sql_query_1) 190 | # postgres_connection.commit() 191 | root_logger.info(f'''Successfully created index '{index_1}' index for table '{table_1}' table on '{column_1}' column ''') 192 | root_logger.info(f'-------------------------------------------------------------') 193 | root_logger.info(f'') 194 | root_logger.info(f'') 195 | root_logger.info(f'') 196 | root_logger.info(f'') 197 | 198 | 199 | cursor.execute(create_index_sql_query_2) 200 | # postgres_connection.commit() 201 | root_logger.info(f'''Successfully created index '{index_2}' index for table '{table_1}' table on '{column_2}' column ''') 202 | root_logger.info(f'-------------------------------------------------------------') 203 | root_logger.info(f'') 204 | root_logger.info(f'') 205 | root_logger.info(f'') 206 | root_logger.info(f'') 207 | 208 | 209 | cursor.execute(create_index_sql_query_3) 210 | # postgres_connection.commit() 211 | root_logger.info(f'''Successfully created index '{index_3}' index for table '{table_2}' table on '{column_3}' column ''') 212 | root_logger.info(f'-------------------------------------------------------------') 213 | root_logger.info(f'') 214 | root_logger.info(f'') 215 | root_logger.info(f'') 216 | root_logger.info(f'') 217 | except psycopg2.Error as e: 218 | root_logger.error(e) 219 | 220 | 221 | except psycopg2.Error as e: 222 | root_logger.error(e) 223 | 224 | 225 | set_up_access_controls(postgres_connection) 226 | 227 | 228 | 229 | 230 | # Miscellaneous scripts 231 | 232 | ''' 233 | 234 | -- Check if indexes exist 235 | SELECT * FROM pg_indexes WHERE schemaname != 'pg_catalog' 236 | 237 | ''' -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: postgres_dwh 2 | channels: 3 | - conda-forge 4 | dependencies: 5 | - python=3.10 6 | - pandas 7 | - 
faker 8 | - path 9 | - psycopg2 10 | - configparser -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | aiosqlite==0.18.0 2 | alembic==1.9.2 3 | anyio==3.6.2 4 | apprise==1.2.1 5 | asgi-lifespan==2.0.0 6 | asyncpg==0.27.0 7 | attrs==22.2.0 8 | cachetools==5.3.0 9 | certifi==2022.12.7 10 | cffi==1.15.1 11 | charset-normalizer==3.0.1 12 | click==8.1.3 13 | cloudpickle==2.2.1 14 | color-it==2.1.3 15 | colorama==0.4.6 16 | coloredlogs==15.0.1 17 | configparser==5.3.0 18 | coolname==2.2.0 19 | croniter==1.3.8 20 | cryptography==39.0.0 21 | dash==2.8.1 22 | dash-core-components==2.0.0 23 | dash-html-components==2.0.0 24 | dash-table==5.0.0 25 | dateparser==1.1.7 26 | docker==6.0.1 27 | docx==0.2.4 28 | exceptiongroup==1.1.0 29 | Faker @ file:///home/conda/feedstock_root/build_artifacts/faker_1674662398228/work 30 | fastapi==0.89.1 31 | Flask==2.2.3 32 | fsspec==2023.1.0 33 | google-auth==2.16.0 34 | greenlet==2.0.2 35 | griffe==0.25.4 36 | h11==0.14.0 37 | h2==4.1.0 38 | hpack==4.0.0 39 | httpcore==0.16.3 40 | httpx==0.23.3 41 | humanfriendly==10.0 42 | hyperframe==6.0.1 43 | idna==3.4 44 | iniconfig==2.0.0 45 | itsdangerous==2.1.2 46 | Jinja2==3.1.2 47 | jsonpatch==1.32 48 | jsonpointer==2.3 49 | jsonschema==4.17.3 50 | kubernetes==25.3.0 51 | lxml==4.9.2 52 | Mako==1.2.4 53 | Markdown==3.4.1 54 | markdown-it-py==2.1.0 55 | MarkupSafe==2.1.2 56 | mdurl==0.1.2 57 | numpy @ file:///D:/bld/numpy_1675642725500/work 58 | oauthlib==3.2.2 59 | orjson==3.8.5 60 | packaging==23.0 61 | pandas @ file:///D:/bld/pandas_1674136614909/work 62 | path @ file:///D:/bld/path_1672072831285/work 63 | pathspec==0.11.0 64 | pendulum==2.1.2 65 | Pillow==9.4.0 66 | plotly==5.13.0 67 | pluggy==1.0.0 68 | prefect==2.8.0 69 | psycopg2 @ file:///D:/bld/psycopg2-split_1672159353062/work 70 | py==1.11.0 71 | pyasn1==0.4.8 72 | pyasn1-modules==0.2.8 73 | pycparser==2.21 74 | pydantic==1.10.4 75 | Pygments==2.14.0 76 | PyPDF2==3.0.1 77 | pyreadline3==3.4.1 78 | pyrsistent==0.19.3 79 | pytesseract==0.3.10 80 | pytest==7.2.1 81 | pytest-html==3.2.0 82 | pytest-metadata==2.0.4 83 | python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1626286286081/work 84 | python-docx==0.8.11 85 | python-dotenv==1.0.0 86 | python-slugify==8.0.0 87 | pytz @ file:///home/conda/feedstock_root/build_artifacts/pytz_1673864280276/work 88 | pytz-deprecation-shim==0.1.0.post0 89 | pytzdata==2020.1 90 | pywin32==305 91 | PyYAML==6.0 92 | readchar==4.0.3 93 | regex==2022.10.31 94 | requests==2.28.2 95 | requests-oauthlib==1.3.1 96 | rfc3986==1.5.0 97 | rich==13.3.1 98 | rsa==4.9 99 | six @ file:///home/conda/feedstock_root/build_artifacts/six_1620240208055/work 100 | sniffio==1.3.0 101 | SQLAlchemy==1.4.46 102 | starlette==0.22.0 103 | tenacity==8.2.1 104 | text-unidecode==1.3 105 | toml==0.10.2 106 | tomli==2.0.1 107 | typer==0.7.0 108 | typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1665144421445/work 109 | tzdata==2022.7 110 | tzlocal==4.2 111 | urllib3==1.26.14 112 | uvicorn==0.20.0 113 | websocket-client==1.5.1 114 | websockets==10.4 115 | Werkzeug==2.2.3 116 | --------------------------------------------------------------------------------