├── README.md ├── airflow-sql ├── .dockerignore ├── .gitignore ├── Dockerfile ├── airflow.testcfg ├── config │ ├── airflow - backup.cfg │ └── airflow.cfg ├── dags │ └── dbt_orchestration.py ├── docker-compose.yml ├── profile │ ├── .user.yml │ └── profiles.yml └── warehouse │ ├── .gitignore │ ├── .user.yml │ ├── .vscode │ └── settings.json │ ├── README.md │ ├── analyses │ └── .gitkeep │ ├── archive │ ├── stg_salesorderheadersalesreason.sql │ ├── stg_salesreason.sql │ ├── stg_store.sql │ └── testDate.sql │ ├── dbt_project.yml │ ├── macros │ ├── .gitkeep │ └── generate_schema_name.sql │ ├── models │ ├── mart │ │ ├── address.sql │ │ ├── customer.sql │ │ ├── date.sql │ │ ├── orderstatus.sql │ │ ├── product.sql │ │ ├── sales.sql │ │ ├── schema.yml │ │ └── territory.sql │ └── staging │ │ ├── schema.yml │ │ ├── source.yml │ │ ├── stg_address.sql │ │ ├── stg_country.sql │ │ ├── stg_countryregion.sql │ │ ├── stg_customer.sql │ │ ├── stg_date.sql │ │ ├── stg_entityaddress.sql │ │ ├── stg_person.sql │ │ ├── stg_product.sql │ │ ├── stg_productcategory.sql │ │ ├── stg_productsubcategory.sql │ │ ├── stg_salesorderdetail.sql │ │ ├── stg_salesorderheader.sql │ │ ├── stg_salesterritory.sql │ │ └── stg_stateprovince.sql │ ├── package-lock.yml │ ├── packages.yml │ ├── profiles.yml │ ├── seeds │ ├── .gitkeep │ └── country_codes.csv │ ├── snapshots │ ├── .gitkeep │ ├── product_snapshot.sql │ └── source.yml │ └── tests │ └── .gitkeep ├── dbtDAGs.png ├── sql ├── dimCustomer.sql ├── dimDate.sql ├── readme.md ├── stg_entityaddress.sql ├── stg_salesorderheader.sql ├── vw_countryregion.sql ├── vw_person.sql ├── vw_product.sql ├── vw_productcategory.sql ├── vw_productsubcategory.sql ├── vw_salesorderheader.sql ├── vw_salesreason.sql ├── vw_salesterritory.sql ├── vw_stateprovince.sql └── vw_store.sql ├── star-schema-example1.png └── warehouse ├── .gitignore ├── .user.yml ├── .vscode └── settings.json ├── README.md ├── analyses ├── .gitkeep ├── create table product.sql ├── update_modified_date_on_product_task.sql ├── update_product_table_script.sql └── update_product_task_on.sql ├── datawarehouse_dagster ├── __pycache__ │ └── __init__.cpython-39.pyc ├── datawarehouse_dagster │ ├── __init__.py │ ├── __pycache__ │ │ ├── __init__.cpython-39.pyc │ │ ├── airbyte.cpython-39.pyc │ │ ├── assets.cpython-39.pyc │ │ ├── constants.cpython-39.pyc │ │ ├── dbt.cpython-39.pyc │ │ ├── definitions.cpython-39.pyc │ │ └── schedules.cpython-39.pyc │ ├── airbyte.py │ ├── constants.py │ ├── dbt.py │ ├── definitions.py │ └── schedules.py ├── pyproject.toml └── setup.py ├── dbt_project.yml ├── macros ├── .gitkeep └── generate_schema_name.sql ├── models ├── mart │ ├── dimAddress.sql │ ├── dimCustomer.sql │ ├── dimDate.sql │ ├── dimOrderStatus.sql │ ├── dimProduct.sql │ ├── dimTerritory.sql │ ├── fctSales.sql │ └── schema.yml └── staging │ ├── schema.yml │ ├── source.yml │ ├── stg_address.sql │ ├── stg_countryregion.sql │ ├── stg_customer.sql │ ├── stg_date.sql │ ├── stg_entityaddress.sql │ ├── stg_person.sql │ ├── stg_product.sql │ ├── stg_productcategory.sql │ ├── stg_productsubcategory.sql │ ├── stg_salesorderdetail.sql │ ├── stg_salesorderheader.sql │ ├── stg_salesterritory.sql │ └── stg_stateprovince.sql ├── packages.yml ├── profiles.yml ├── schema.py ├── seeds ├── .gitkeep └── countryisocodes.csv ├── snapshots ├── .gitkeep ├── product_snapshot.sql └── source.yml ├── source.py └── tests └── .gitkeep /README.md: -------------------------------------------------------------------------------- 1 | # dbt build a datawarehouse using dimensional 
modeling 2 | This is a dbt repo in which we build a practical data warehouse using the Kimball dimensional model. 3 | 4 | We use the SQL Server transactional database [AdventureWorks2019](https://learn.microsoft.com/en-us/sql/samples/adventureworks-install-configure?view=sql-server-ver16&tabs=ssms) as our source and extract and load the data with the EL tool [Airbyte](https://www.youtube.com/watch?v=2FvMa7vaxDY&t). 5 | 6 | ## Project Setup 7 | You will need the following tech stack to follow along with this project. 8 | 9 | - You can set up the SQL Server environment using the [following guidelines](https://www.youtube.com/watch?v=e5mvoKuV3xs&t=6s) 10 | - PostgreSQL setup guide [link](https://www.youtube.com/watch?v=fjYiWXHI7Mo) 11 | - Airbyte setup [guide](https://www.youtube.com/watch?v=2FvMa7vaxDY&t) 12 | - dbt setup [guide](https://www.youtube.com/watch?v=gH1w4OIgXj4) 13 | - Python 3.8 or above [installation guide](https://www.youtube.com/watch?v=B0G-44dqHRM&t) 14 | - dbt 1.4.5 or above 15 | - dbt postgres plugin 1.4.5 or above 16 | 17 | Using dbt, we transform this data into dimensions and facts. 18 | 19 | 20 | ## Source tables/views used from the AdventureWorks database 21 | | schemaname | tablename | type | 22 | |------------ |----------------------------- |------- | 23 | | source | address | table | 24 | | source | businessentityaddress | table | 25 | | source | customer | table | 26 | | source | salesorderdetail | table | 27 | | source | salesorderheadersalesreason | table | 28 | | source | salesorderheader | table | 29 | | source | vw_countryregion | view | 30 | | source | vw_product | view | 31 | | source | vw_productcategory | view | 32 | | source | vw_person | view | 33 | | source | vw_store | view | 34 | | source | vw_salesreason | view | 35 | | source | vw_salesterritory | view | 36 | | source | vw_productsubcategory | view | 37 | | source | vw_stateprovince | view | 38 | | source | vw_salesorderheader | view | 39 | 40 | ## Dimensional Modeling 101 41 | To understand Kimball’s approach to data modeling, we should begin by talking about the star schema. The star schema is a particular way of organizing data for analytical purposes. It consists of two types of tables: 42 | - A fact table, which acts as the primary table for the schema. A fact table contains the primary measurements, metrics, or ‘facts’ of a business process. 43 | - Many dimension tables associated with the fact table. Each dimension table contains ‘dimensions’ — that is, descriptive attributes of the fact table. These dimensional tables 'surround' the fact table, which is where the name 'star schema' comes from. 44 | 45 | ![image](star-schema-example1.png) 46 | 47 | ## Advantages of Dimensional Modeling 48 | - Dimensional data modeling enables users to easily access data through simple queries, reducing the time and effort required to retrieve and analyze data. 49 | - The simple structure of dimensional data modeling allows for faster query performance, particularly when compared to relational data models. 50 | - Dimensional data modeling allows for more flexible data analysis, as users can quickly and easily explore relationships between data. 51 | - Dimensional data modeling can improve data quality by reducing redundancy and inconsistencies in the data. 52 | - Dimensional data modeling uses simple, intuitive structures that are easy to understand, even for non-technical users. 53 |
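To make the star schema concrete, here is a minimal, illustrative dbt fact model. It is a sketch rather than the project's actual `sales`/`fctSales` model: the `ref()` targets correspond to staging models that exist in this repo, but the column names are assumed to pass through unchanged from the AdventureWorks source tables.

```sql
-- Hypothetical example, e.g. models/mart/fct_sales_example.sql
-- Grain: one row per sales order line; each *_key column joins out to a surrounding dimension.
with orders as (
    select * from {{ ref('stg_salesorderheader') }}
),

order_lines as (
    select * from {{ ref('stg_salesorderdetail') }}
)

select
    order_lines.salesorderdetailid                 as sales_key,       -- primary key of the fact at order-line grain
    orders.customerid                              as customer_key,    -- joins to the customer dimension
    order_lines.productid                          as product_key,     -- joins to the product dimension
    cast(orders.orderdate as date)                 as order_date_key,  -- joins to the date dimension
    orders.territoryid                             as territory_key,   -- joins to the territory dimension
    order_lines.orderqty                           as order_quantity,  -- additive measures (the 'facts')
    order_lines.unitprice                          as unit_price,
    order_lines.orderqty * order_lines.unitprice   as gross_revenue
from order_lines
join orders
    on order_lines.salesorderid = orders.salesorderid
```

Dimension models such as `product` or `customer` are built the same way, but hold the descriptive attributes (names, categories, geography, dates) that these keys resolve to.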
54 | 55 | ## DBT Completed DAG 56 | ![image](dbtDAGs.png) 57 | -------------------------------------------------------------------------------- /airflow-sql/.dockerignore: -------------------------------------------------------------------------------- 1 | .git -------------------------------------------------------------------------------- /airflow-sql/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | 106 | # Mac 107 | .DS_Store 108 | -------------------------------------------------------------------------------- /airflow-sql/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM apache/airflow:2.9.0 2 | 3 | RUN pip install dbt-postgres==1.8.2 \ 4 | && pip install markupsafe==2.0.1 \ 5 | && pip install apache-airflow-providers-postgres \ 6 | && pip install apache-airflow-providers-odbc \ 7 | && pip install psycopg2-binary \ 8 | && pip install pyodbc \ 9 | && pip install apache-airflow-providers-microsoft-mssql \ 10 | && pip install apache-airflow-providers-microsoft-mssql[odbc] \ 11 | && pip install apache-airflow-providers-microsoft-azure \ 12 | && pip install gitpython \ 13 | && pip install apache-airflow-providers-airbyte[http] \ 14 | && pip install apache-airflow-providers-airbyte \ 15 | && pip install oracledb --upgrade \ 16 | && pip install apache-airflow-providers-common-sql \ 17 | && pip install apache-airflow-providers-oracle \ 18 | && pip install apache-airflow-providers-oracle[common.sql] \ 19 | && pip install dbt-airflow \ 20 | && pip install plyvel \ 21 | && pip install --upgrade cmake \ 22 | && pip install --upgrade pyarrow==14.0.0 \ 23 | && pip install airflow-provider-great-expectations \ 24 | && pip uninstall dbt \ 25 | && pip install dbt-core==1.8.2 26 | 27 | 28 | USER root 29 | RUN sudo apt-key adv --keyserver 
hkp://keyserver.ubuntu.com:80 --recv-keys B7B3B788A8D3785C 30 | RUN sudo apt-get update 31 | RUN sudo apt-get install -y git 32 | 33 | USER airflow 34 | RUN airflow db migrate 35 | #RUN airflow db init 36 | #RUN airflow db upgrade -------------------------------------------------------------------------------- /airflow-sql/airflow.testcfg: -------------------------------------------------------------------------------- 1 | [smtp] 2 | smtp_host = 192.168.1.39 3 | smtp_starttls = True 4 | smtp_ssl = False 5 | smtp_user = hnawaz@localmail.com 6 | smtp_password = aneela01 7 | smtp_port = 25 8 | smtp_mail_from = admin@localmail.com -------------------------------------------------------------------------------- /airflow-sql/config/airflow - backup.cfg: -------------------------------------------------------------------------------- 1 | #[core] 2 | [database] 3 | # The folder where your airflow pipelines live, most likely a 4 | # subfolder in a code repository. This path must be absolute. 5 | dags_folder = /opt/airflow/dags 6 | 7 | # Hostname by providing a path to a callable, which will resolve the hostname. 8 | # The format is "package.function". 9 | # 10 | # For example, default value "socket.getfqdn" means that result from getfqdn() of "socket" 11 | # package will be used as hostname. 12 | # 13 | # No argument should be required in the function specified. 14 | # If using IP address as hostname is preferred, use value ``airflow.utils.net.get_host_ip_address`` 15 | hostname_callable = socket.getfqdn 16 | 17 | # Default timezone in case supplied date times are naive 18 | # can be utc (default), system, or any IANA timezone string (e.g. Europe/Amsterdam) 19 | default_timezone = utc 20 | 21 | # The executor class that airflow should use. Choices include 22 | # ``SequentialExecutor``, ``LocalExecutor``, ``CeleryExecutor``, ``DaskExecutor``, 23 | # ``KubernetesExecutor``, ``CeleryKubernetesExecutor`` or the 24 | # full import path to the class when using a custom executor. 25 | executor = SequentialExecutor 26 | 27 | # The SqlAlchemy connection string to the metadata database. 28 | # SqlAlchemy supports many different database engines. 29 | # More information here: 30 | # http://airflow.apache.org/docs/apache-airflow/stable/howto/set-up-database.html#database-uri 31 | sql_alchemy_conn = sqlite:////opt/airflow/airflow.db 32 | 33 | # The encoding for the databases 34 | sql_engine_encoding = utf-8 35 | 36 | # Collation for ``dag_id``, ``task_id``, ``key`` columns in case they have different encoding. 37 | # By default this collation is the same as the database collation, however for ``mysql`` and ``mariadb`` 38 | # the default is ``utf8mb3_bin`` so that the index sizes of our index keys will not exceed 39 | # the maximum size of allowed index when collation is set to ``utf8mb4`` variant 40 | # (see https://github.com/apache/airflow/pull/17603#issuecomment-901121618). 41 | # sql_engine_collation_for_ids = 42 | 43 | # If SqlAlchemy should pool database connections. 44 | sql_alchemy_pool_enabled = True 45 | 46 | # The SqlAlchemy pool size is the maximum number of database connections 47 | # in the pool. 0 indicates no limit. 48 | sql_alchemy_pool_size = 5 49 | 50 | # The maximum overflow size of the pool. 51 | # When the number of checked-out connections reaches the size set in pool_size, 52 | # additional connections will be returned up to this limit. 53 | # When those additional connections are returned to the pool, they are disconnected and discarded. 
54 | # It follows then that the total number of simultaneous connections the pool will allow 55 | # is pool_size + max_overflow, 56 | # and the total number of "sleeping" connections the pool will allow is pool_size. 57 | # max_overflow can be set to ``-1`` to indicate no overflow limit; 58 | # no limit will be placed on the total number of concurrent connections. Defaults to ``10``. 59 | sql_alchemy_max_overflow = 10 60 | 61 | # The SqlAlchemy pool recycle is the number of seconds a connection 62 | # can be idle in the pool before it is invalidated. This config does 63 | # not apply to sqlite. If the number of DB connections is ever exceeded, 64 | # a lower config value will allow the system to recover faster. 65 | sql_alchemy_pool_recycle = 1800 66 | 67 | # Check connection at the start of each connection pool checkout. 68 | # Typically, this is a simple statement like "SELECT 1". 69 | # More information here: 70 | # https://docs.sqlalchemy.org/en/13/core/pooling.html#disconnect-handling-pessimistic 71 | sql_alchemy_pool_pre_ping = True 72 | 73 | # The schema to use for the metadata database. 74 | # SqlAlchemy supports databases with the concept of multiple schemas. 75 | sql_alchemy_schema = 76 | 77 | # Import path for connect args in SqlAlchemy. Defaults to an empty dict. 78 | # This is useful when you want to configure db engine args that SqlAlchemy won't parse 79 | # in connection string. 80 | # See https://docs.sqlalchemy.org/en/13/core/engines.html#sqlalchemy.create_engine.params.connect_args 81 | # sql_alchemy_connect_args = 82 | 83 | # This defines the maximum number of task instances that can run concurrently in Airflow 84 | # regardless of scheduler count and worker count. Generally, this value is reflective of 85 | # the number of task instances with the running state in the metadata database. 86 | parallelism = 32 87 | 88 | # The maximum number of task instances allowed to run concurrently in each DAG. To calculate 89 | # the number of tasks that is running concurrently for a DAG, add up the number of running 90 | # tasks for all DAG runs of the DAG. This is configurable at the DAG level with ``max_active_tasks``, 91 | # which is defaulted as ``max_active_tasks_per_dag``. 92 | # 93 | # An example scenario when this would be useful is when you want to stop a new dag with an early 94 | # start date from stealing all the executor slots in a cluster. 95 | max_active_tasks_per_dag = 16 96 | 97 | # Are DAGs paused by default at creation 98 | dags_are_paused_at_creation = True 99 | 100 | # The maximum number of active DAG runs per DAG. The scheduler will not create more DAG runs 101 | # if it reaches the limit. This is configurable at the DAG level with ``max_active_runs``, 102 | # which is defaulted as ``max_active_runs_per_dag``. 103 | max_active_runs_per_dag = 16 104 | 105 | # Whether to load the DAG examples that ship with Airflow. It's good to 106 | # get started, but you probably want to set this to ``False`` in a production 107 | # environment 108 | load_examples = True 109 | 110 | # Whether to load the default connections that ship with Airflow. 
It's good to 111 | # get started, but you probably want to set this to ``False`` in a production 112 | # environment 113 | load_default_connections = True 114 | 115 | # Path to the folder containing Airflow plugins 116 | plugins_folder = /opt/airflow/plugins 117 | 118 | # Should tasks be executed via forking of the parent process ("False", 119 | # the speedier option) or by spawning a new python process ("True" slow, 120 | # but means plugin changes picked up by tasks straight away) 121 | execute_tasks_new_python_interpreter = False 122 | 123 | # Secret key to save connection passwords in the db 124 | fernet_key = 125 | 126 | # Whether to disable pickling dags 127 | donot_pickle = True 128 | 129 | # How long before timing out a python file import 130 | dagbag_import_timeout = 30.0 131 | 132 | # Should a traceback be shown in the UI for dagbag import errors, 133 | # instead of just the exception message 134 | dagbag_import_error_tracebacks = True 135 | 136 | # If tracebacks are shown, how many entries from the traceback should be shown 137 | dagbag_import_error_traceback_depth = 2 138 | 139 | # How long before timing out a DagFileProcessor, which processes a dag file 140 | dag_file_processor_timeout = 50 141 | 142 | # The class to use for running task instances in a subprocess. 143 | # Choices include StandardTaskRunner, CgroupTaskRunner or the full import path to the class 144 | # when using a custom task runner. 145 | task_runner = StandardTaskRunner 146 | 147 | # If set, tasks without a ``run_as_user`` argument will be run with this user 148 | # Can be used to de-elevate a sudo user running Airflow when executing tasks 149 | default_impersonation = 150 | 151 | # What security module to use (for example kerberos) 152 | security = 153 | 154 | # Turn unit test mode on (overwrites many configuration options with test 155 | # values at runtime) 156 | unit_test_mode = False 157 | 158 | # Whether to enable pickling for xcom (note that this is insecure and allows for 159 | # RCE exploits). 160 | enable_xcom_pickling = False 161 | 162 | # When a task is killed forcefully, this is the amount of time in seconds that 163 | # it has to cleanup after it is sent a SIGTERM, before it is SIGKILLED 164 | killed_task_cleanup_time = 60 165 | 166 | # Whether to override params with dag_run.conf. If you pass some key-value pairs 167 | # through ``airflow dags backfill -c`` or 168 | # ``airflow dags trigger -c``, the key-value pairs will override the existing ones in params. 169 | dag_run_conf_overrides_params = True 170 | 171 | # When discovering DAGs, ignore any files that don't contain the strings ``DAG`` and ``airflow``. 172 | dag_discovery_safe_mode = True 173 | 174 | # The number of retries each task is going to have by default. Can be overridden at dag or task level. 175 | default_task_retries = 0 176 | 177 | # The weighting method used for the effective total priority weight of the task 178 | default_task_weight_rule = downstream 179 | 180 | # Updating serialized DAG can not be faster than a minimum interval to reduce database write rate. 181 | min_serialized_dag_update_interval = 30 182 | 183 | # Fetching serialized DAG can not be faster than a minimum interval to reduce database 184 | # read rate. This config controls when your DAGs are updated in the Webserver 185 | min_serialized_dag_fetch_interval = 10 186 | 187 | # Maximum number of Rendered Task Instance Fields (Template Fields) per task to store 188 | # in the Database. 
189 | # All the template_fields for each of Task Instance are stored in the Database. 190 | # Keeping this number small may cause an error when you try to view ``Rendered`` tab in 191 | # TaskInstance view for older tasks. 192 | max_num_rendered_ti_fields_per_task = 30 193 | 194 | # On each dagrun check against defined SLAs 195 | check_slas = True 196 | 197 | # Path to custom XCom class that will be used to store and resolve operators results 198 | # Example: xcom_backend = path.to.CustomXCom 199 | xcom_backend = airflow.models.xcom.BaseXCom 200 | 201 | # By default Airflow plugins are lazily-loaded (only loaded when required). Set it to ``False``, 202 | # if you want to load plugins whenever 'airflow' is invoked via cli or loaded from module. 203 | lazy_load_plugins = True 204 | 205 | # By default Airflow providers are lazily-discovered (discovery and imports happen only when required). 206 | # Set it to False, if you want to discover providers whenever 'airflow' is invoked via cli or 207 | # loaded from module. 208 | lazy_discover_providers = True 209 | 210 | # Number of times the code should be retried in case of DB Operational Errors. 211 | # Not all transactions will be retried as it can cause undesired state. 212 | # Currently it is only used in ``DagFileProcessor.process_file`` to retry ``dagbag.sync_to_db``. 213 | max_db_retries = 3 214 | 215 | # Hide sensitive Variables or Connection extra json keys from UI and task logs when set to True 216 | # 217 | # (Connection passwords are always hidden in logs) 218 | hide_sensitive_var_conn_fields = True 219 | 220 | # A comma-separated list of extra sensitive keywords to look for in variables names or connection's 221 | # extra JSON. 222 | sensitive_var_conn_names = 223 | 224 | # Task Slot counts for ``default_pool``. This setting would not have any effect in an existing 225 | # deployment where the ``default_pool`` is already created. For existing deployments, users can 226 | # change the number of slots using Webserver, API or the CLI 227 | default_pool_task_slot_count = 128 228 | 229 | [logging] 230 | # The folder where airflow should store its log files. 231 | # This path must be absolute. 232 | # There are a few existing configurations that assume this is set to the default. 233 | # If you choose to override this you may need to update the dag_processor_manager_log_location and 234 | # dag_processor_manager_log_location settings as well. 235 | base_log_folder = /opt/airflow/logs 236 | 237 | # Airflow can store logs remotely in AWS S3, Google Cloud Storage or Elastic Search. 238 | # Set this to True if you want to enable remote logging. 239 | remote_logging = False 240 | 241 | # Users must supply an Airflow connection id that provides access to the storage 242 | # location. 243 | remote_log_conn_id = 244 | 245 | # Path to Google Credential JSON file. If omitted, authorization based on `the Application Default 246 | # Credentials 247 | # `__ will 248 | # be used. 249 | google_key_path = 250 | 251 | # Storage bucket URL for remote logging 252 | # S3 buckets should start with "s3://" 253 | # Cloudwatch log groups should start with "cloudwatch://" 254 | # GCS buckets should start with "gs://" 255 | # WASB buckets should start with "wasb" just to help Airflow select correct handler 256 | # Stackdriver logs should start with "stackdriver://" 257 | remote_base_log_folder = 258 | 259 | # Use server-side encryption for logs stored in S3 260 | encrypt_s3_logs = False 261 | 262 | # Logging level. 
263 | # 264 | # Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. 265 | logging_level = INFO 266 | 267 | # Logging level for Flask-appbuilder UI. 268 | # 269 | # Supported values: ``CRITICAL``, ``ERROR``, ``WARNING``, ``INFO``, ``DEBUG``. 270 | fab_logging_level = WARNING 271 | 272 | # Logging class 273 | # Specify the class that will specify the logging configuration 274 | # This class has to be on the python classpath 275 | # Example: logging_config_class = my.path.default_local_settings.LOGGING_CONFIG 276 | logging_config_class = 277 | 278 | # Flag to enable/disable Colored logs in Console 279 | # Colour the logs when the controlling terminal is a TTY. 280 | colored_console_log = True 281 | 282 | # Log format for when Colored logs is enabled 283 | colored_log_format = [%%(blue)s%%(asctime)s%%(reset)s] {%%(blue)s%%(filename)s:%%(reset)s%%(lineno)d} %%(log_color)s%%(levelname)s%%(reset)s - %%(log_color)s%%(message)s%%(reset)s 284 | colored_formatter_class = airflow.utils.log.colored_log.CustomTTYColoredFormatter 285 | 286 | # Format of Log line 287 | log_format = [%%(asctime)s] {%%(filename)s:%%(lineno)d} %%(levelname)s - %%(message)s 288 | simple_log_format = %%(asctime)s %%(levelname)s - %%(message)s 289 | 290 | # Specify prefix pattern like mentioned below with stream handler TaskHandlerWithCustomFormatter 291 | # Example: task_log_prefix_template = {ti.dag_id}-{ti.task_id}-{execution_date}-{try_number} 292 | task_log_prefix_template = 293 | 294 | # Formatting for how airflow generates file names/paths for each task run. 295 | log_filename_template = {{ ti.dag_id }}/{{ ti.task_id }}/{{ ts }}/{{ try_number }}.log 296 | 297 | # Formatting for how airflow generates file names for log 298 | log_processor_filename_template = {{ filename }}.log 299 | 300 | # Full path of dag_processor_manager logfile. 301 | dag_processor_manager_log_location = /opt/airflow/logs/dag_processor_manager/dag_processor_manager.log 302 | 303 | # Name of handler to read task instance logs. 304 | # Defaults to use ``task`` handler. 305 | task_log_reader = task 306 | 307 | # A comma\-separated list of third-party logger names that will be configured to print messages to 308 | # consoles\. 309 | # Example: extra_logger_names = connexion,sqlalchemy 310 | extra_logger_names = 311 | 312 | # When you start an airflow worker, airflow starts a tiny web server 313 | # subprocess to serve the workers local log files to the airflow main 314 | # web server, who then builds pages and sends them to users. This defines 315 | # the port on which the logs are served. It needs to be unused, and open 316 | # visible from the main web server to connect into the workers. 317 | worker_log_server_port = 8793 318 | 319 | [metrics] 320 | 321 | # StatsD (https://github.com/etsy/statsd) integration settings. 322 | # Enables sending metrics to StatsD. 323 | statsd_on = False 324 | statsd_host = localhost 325 | statsd_port = 8125 326 | statsd_prefix = airflow 327 | 328 | # If you want to avoid sending all the available metrics to StatsD, 329 | # you can configure an allow list of prefixes (comma separated) to send only the metrics that 330 | # start with the elements of the list (e.g: "scheduler,executor,dagrun") 331 | statsd_allow_list = 332 | 333 | # A function that validate the statsd stat name, apply changes to the stat name if necessary and return 334 | # the transformed stat name. 
335 | # 336 | # The function should have the following signature: 337 | # def func_name(stat_name: str) -> str: 338 | stat_name_handler = 339 | 340 | # To enable datadog integration to send airflow metrics. 341 | statsd_datadog_enabled = False 342 | 343 | # List of datadog tags attached to all metrics(e.g: key1:value1,key2:value2) 344 | statsd_datadog_tags = 345 | 346 | # If you want to utilise your own custom Statsd client set the relevant 347 | # module path below. 348 | # Note: The module path must exist on your PYTHONPATH for Airflow to pick it up 349 | # statsd_custom_client_path = 350 | 351 | [secrets] 352 | # Full class name of secrets backend to enable (will precede env vars and metastore in search path) 353 | # Example: backend = airflow.providers.amazon.aws.secrets.systems_manager.SystemsManagerParameterStoreBackend 354 | backend = 355 | 356 | # The backend_kwargs param is loaded into a dictionary and passed to __init__ of secrets backend class. 357 | # See documentation for the secrets backend you are using. JSON is expected. 358 | # Example for AWS Systems Manager ParameterStore: 359 | # ``{"connections_prefix": "/airflow/connections", "profile_name": "default"}`` 360 | backend_kwargs = 361 | 362 | [cli] 363 | # In what way should the cli access the API. The LocalClient will use the 364 | # database directly, while the json_client will use the api running on the 365 | # webserver 366 | api_client = airflow.api.client.local_client 367 | 368 | # If you set web_server_url_prefix, do NOT forget to append it here, ex: 369 | # ``endpoint_url = http://localhost:8080/myroot`` 370 | # So api will look like: ``http://localhost:8080/myroot/api/experimental/...`` 371 | endpoint_url = http://localhost:8080 372 | 373 | [debug] 374 | # Used only with ``DebugExecutor``. If set to ``True`` DAG will fail with first 375 | # failed task. Helpful for debugging purposes. 376 | fail_fast = False 377 | 378 | [api] 379 | # Enables the deprecated experimental API. Please note that these APIs do not have access control. 380 | # The authenticated user has full access. 381 | # 382 | # .. warning:: 383 | # 384 | # This `Experimental REST API `__ is 385 | # deprecated since version 2.0. Please consider using 386 | # `the Stable REST API `__. 387 | # For more information on migration, see 388 | # `UPDATING.md `_ 389 | enable_experimental_api = False 390 | 391 | # How to authenticate users of the API. See 392 | # https://airflow.apache.org/docs/apache-airflow/stable/security.html for possible values. 393 | # ("airflow.api.auth.backend.default" allows all requests for historic reasons) 394 | auth_backend = airflow.api.auth.backend.deny_all 395 | 396 | # Used to set the maximum page limit for API requests 397 | maximum_page_limit = 100 398 | 399 | # Used to set the default page limit when limit is zero. A default limit 400 | # of 100 is set on OpenApi spec. However, this particular default limit 401 | # only work when limit is set equal to zero(0) from API requests. 402 | # If no limit is supplied, the OpenApi spec default is used. 403 | fallback_page_limit = 100 404 | 405 | # The intended audience for JWT token credentials used for authorization. This value must match on the client and server sides. If empty, audience will not be tested. 406 | # Example: google_oauth2_audience = project-id-random-value.apps.googleusercontent.com 407 | google_oauth2_audience = 408 | 409 | # Path to Google Cloud Service Account key file (JSON). 
If omitted, authorization based on 410 | # `the Application Default Credentials 411 | # `__ will 412 | # be used. 413 | # Example: google_key_path = /files/service-account-json 414 | google_key_path = 415 | 416 | # Used in response to a preflight request to indicate which HTTP 417 | # headers can be used when making the actual request. This header is 418 | # the server side response to the browser's 419 | # Access-Control-Request-Headers header. 420 | access_control_allow_headers = 421 | 422 | # Specifies the method or methods allowed when accessing the resource. 423 | access_control_allow_methods = 424 | 425 | # Indicates whether the response can be shared with requesting code from the given origins. 426 | # Separate URLs with space. 427 | access_control_allow_origins = 428 | 429 | [lineage] 430 | # what lineage backend to use 431 | backend = 432 | 433 | [atlas] 434 | sasl_enabled = False 435 | host = 436 | port = 21000 437 | username = 438 | password = 439 | 440 | [operators] 441 | # The default owner assigned to each new operator, unless 442 | # provided explicitly or passed via ``default_args`` 443 | default_owner = airflow 444 | default_cpus = 1 445 | default_ram = 512 446 | default_disk = 512 447 | default_gpus = 0 448 | 449 | # Default queue that tasks get assigned to and that worker listen on. 450 | default_queue = default 451 | 452 | # Is allowed to pass additional/unused arguments (args, kwargs) to the BaseOperator operator. 453 | # If set to False, an exception will be thrown, otherwise only the console message will be displayed. 454 | allow_illegal_arguments = False 455 | 456 | [hive] 457 | # Default mapreduce queue for HiveOperator tasks 458 | default_hive_mapred_queue = 459 | 460 | # Template for mapred_job_name in HiveOperator, supports the following named parameters 461 | # hostname, dag_id, task_id, execution_date 462 | # mapred_job_name_template = 463 | 464 | [webserver] 465 | # The base url of your website as airflow cannot guess what domain or 466 | # cname you are using. This is used in automated emails that 467 | # airflow sends to point links to the right web server 468 | base_url = http://localhost:8080 469 | 470 | # Default timezone to display all dates in the UI, can be UTC, system, or 471 | # any IANA timezone string (e.g. Europe/Amsterdam). If left empty the 472 | # default value of core/default_timezone will be used 473 | # Example: default_ui_timezone = America/New_York 474 | default_ui_timezone = UTC 475 | 476 | # The ip specified when starting the web server 477 | web_server_host = 0.0.0.0 478 | 479 | # The port on which to run the web server 480 | web_server_port = 8080 481 | 482 | # Paths to the SSL certificate and key for the web server. When both are 483 | # provided SSL will be enabled. This does not change the web server port. 484 | web_server_ssl_cert = 485 | 486 | # Paths to the SSL certificate and key for the web server. When both are 487 | # provided SSL will be enabled. This does not change the web server port. 488 | web_server_ssl_key = 489 | 490 | # Number of seconds the webserver waits before killing gunicorn master that doesn't respond 491 | web_server_master_timeout = 120 492 | 493 | # Number of seconds the gunicorn webserver waits before timing out on a worker 494 | web_server_worker_timeout = 120 495 | 496 | # Number of workers to refresh at a time. When set to 0, worker refresh is 497 | # disabled. When nonzero, airflow periodically refreshes webserver workers by 498 | # bringing up new ones and killing old ones. 
499 | worker_refresh_batch_size = 1 500 | 501 | # Number of seconds to wait before refreshing a batch of workers. 502 | worker_refresh_interval = 6000 503 | 504 | # If set to True, Airflow will track files in plugins_folder directory. When it detects changes, 505 | # then reload the gunicorn. 506 | reload_on_plugin_change = False 507 | 508 | # Secret key used to run your flask app. It should be as random as possible. However, when running 509 | # more than 1 instances of webserver, make sure all of them use the same ``secret_key`` otherwise 510 | # one of them will error with "CSRF session token is missing". 511 | secret_key = 5/xGOvm/Lhk5L2QuyF0B1Q== 512 | 513 | # Number of workers to run the Gunicorn web server 514 | workers = 4 515 | 516 | # The worker class gunicorn should use. Choices include 517 | # sync (default), eventlet, gevent 518 | worker_class = sync 519 | 520 | # Log files for the gunicorn webserver. '-' means log to stderr. 521 | access_logfile = - 522 | 523 | # Log files for the gunicorn webserver. '-' means log to stderr. 524 | error_logfile = - 525 | 526 | # Access log format for gunicorn webserver. 527 | # default format is %%(h)s %%(l)s %%(u)s %%(t)s "%%(r)s" %%(s)s %%(b)s "%%(f)s" "%%(a)s" 528 | # documentation - https://docs.gunicorn.org/en/stable/settings.html#access-log-format 529 | access_logformat = 530 | 531 | # Expose the configuration file in the web server 532 | expose_config = False 533 | 534 | # Expose hostname in the web server 535 | expose_hostname = True 536 | 537 | # Expose stacktrace in the web server 538 | expose_stacktrace = True 539 | 540 | # Default DAG view. Valid values are: ``tree``, ``graph``, ``duration``, ``gantt``, ``landing_times`` 541 | dag_default_view = tree 542 | 543 | # Default DAG orientation. Valid values are: 544 | # ``LR`` (Left->Right), ``TB`` (Top->Bottom), ``RL`` (Right->Left), ``BT`` (Bottom->Top) 545 | dag_orientation = LR 546 | 547 | # The amount of time (in secs) webserver will wait for initial handshake 548 | # while fetching logs from other worker machine 549 | log_fetch_timeout_sec = 5 550 | 551 | # Time interval (in secs) to wait before next log fetching. 552 | log_fetch_delay_sec = 2 553 | 554 | # Distance away from page bottom to enable auto tailing. 555 | log_auto_tailing_offset = 30 556 | 557 | # Animation speed for auto tailing log display. 558 | log_animation_speed = 1000 559 | 560 | # By default, the webserver shows paused DAGs. Flip this to hide paused 561 | # DAGs by default 562 | hide_paused_dags_by_default = False 563 | 564 | # Consistent page size across all listing views in the UI 565 | page_size = 100 566 | 567 | # Define the color of navigation bar 568 | navbar_color = #fff 569 | 570 | # Default dagrun to show in UI 571 | default_dag_run_display_number = 25 572 | 573 | # Enable werkzeug ``ProxyFix`` middleware for reverse proxy 574 | enable_proxy_fix = False 575 | 576 | # Number of values to trust for ``X-Forwarded-For``. 
577 | # More info: https://werkzeug.palletsprojects.com/en/0.16.x/middleware/proxy_fix/ 578 | proxy_fix_x_for = 1 579 | 580 | # Number of values to trust for ``X-Forwarded-Proto`` 581 | proxy_fix_x_proto = 1 582 | 583 | # Number of values to trust for ``X-Forwarded-Host`` 584 | proxy_fix_x_host = 1 585 | 586 | # Number of values to trust for ``X-Forwarded-Port`` 587 | proxy_fix_x_port = 1 588 | 589 | # Number of values to trust for ``X-Forwarded-Prefix`` 590 | proxy_fix_x_prefix = 1 591 | 592 | # Set secure flag on session cookie 593 | cookie_secure = False 594 | 595 | # Set samesite policy on session cookie 596 | cookie_samesite = Lax 597 | 598 | # Default setting for wrap toggle on DAG code and TI log views. 599 | default_wrap = False 600 | 601 | # Allow the UI to be rendered in a frame 602 | x_frame_enabled = True 603 | 604 | # Send anonymous user activity to your analytics tool 605 | # choose from google_analytics, segment, or metarouter 606 | # analytics_tool = 607 | 608 | # Unique ID of your account in the analytics tool 609 | # analytics_id = 610 | 611 | # 'Recent Tasks' stats will show for old DagRuns if set 612 | show_recent_stats_for_completed_runs = True 613 | 614 | # Update FAB permissions and sync security manager roles 615 | # on webserver startup 616 | update_fab_perms = True 617 | 618 | # The UI cookie lifetime in minutes. User will be logged out from UI after 619 | # ``session_lifetime_minutes`` of non-activity 620 | session_lifetime_minutes = 43200 621 | 622 | # Sets a custom page title for the DAGs overview page and site title for all pages 623 | # instance_name = 624 | 625 | # How frequently, in seconds, the DAG data will auto-refresh in graph or tree view 626 | # when auto-refresh is turned on 627 | auto_refresh_interval = 3 628 | 629 | [email] 630 | 631 | # Configuration email backend and whether to 632 | # send email alerts on retry or failure 633 | # Email backend to use 634 | email_backend = airflow.utils.email.send_email_smtp 635 | 636 | # Email connection to use 637 | #email_conn_id = smtp_default 638 | 639 | # Whether email alerts should be sent when a task is retried 640 | #default_email_on_retry = True 641 | 642 | # Whether email alerts should be sent when a task failed 643 | #default_email_on_failure = True 644 | 645 | # File that will be used as the template for Email subject (which will be rendered using Jinja2). 646 | # If not set, Airflow uses a base template. 647 | # Example: subject_template = /path/to/my_subject_template_file 648 | # subject_template = 649 | 650 | # File that will be used as the template for Email content (which will be rendered using Jinja2). 651 | # If not set, Airflow uses a base template. 652 | # Example: html_content_template = /path/to/my_html_content_template_file 653 | # html_content_template = 654 | 655 | [smtp] 656 | 657 | # If you want airflow to send emails on retries, failure, and you want to use 658 | # the airflow.utils.email.send_email_smtp function, you have to configure an 659 | # smtp server here 660 | smtp_host = 192.168.1.39 661 | smtp_starttls = False 662 | smtp_ssl = False 663 | # Example: smtp_user = airflow 664 | smtp_user = airflow@localmail.com 665 | # Example: smtp_password = airflow 666 | smtp_password = demopass 667 | smtp_port = 25 668 | smtp_mail_from = admin@localmail.com 669 | smtp_timeout = 30 670 | smtp_retry_limit = 5 671 | 672 | [sentry] 673 | 674 | # Sentry (https://docs.sentry.io) integration. Here you can supply 675 | # additional configuration options based on the Python platform. 
See: 676 | # https://docs.sentry.io/error-reporting/configuration/?platform=python. 677 | # Unsupported options: ``integrations``, ``in_app_include``, ``in_app_exclude``, 678 | # ``ignore_errors``, ``before_breadcrumb``, ``transport``. 679 | # Enable error reporting to Sentry 680 | sentry_on = false 681 | sentry_dsn = 682 | 683 | # Dotted path to a before_send function that the sentry SDK should be configured to use. 684 | # before_send = 685 | 686 | [celery_kubernetes_executor] 687 | 688 | # This section only applies if you are using the ``CeleryKubernetesExecutor`` in 689 | # ``[core]`` section above 690 | # Define when to send a task to ``KubernetesExecutor`` when using ``CeleryKubernetesExecutor``. 691 | # When the queue of a task is the value of ``kubernetes_queue`` (default ``kubernetes``), 692 | # the task is executed via ``KubernetesExecutor``, 693 | # otherwise via ``CeleryExecutor`` 694 | kubernetes_queue = kubernetes 695 | 696 | [celery] 697 | 698 | # This section only applies if you are using the CeleryExecutor in 699 | # ``[core]`` section above 700 | # The app name that will be used by celery 701 | celery_app_name = airflow.executors.celery_executor 702 | 703 | # The concurrency that will be used when starting workers with the 704 | # ``airflow celery worker`` command. This defines the number of task instances that 705 | # a worker will take, so size up your workers based on the resources on 706 | # your worker box and the nature of your tasks 707 | worker_concurrency = 16 708 | 709 | # The maximum and minimum concurrency that will be used when starting workers with the 710 | # ``airflow celery worker`` command (always keep minimum processes, but grow 711 | # to maximum if necessary). Note the value should be max_concurrency,min_concurrency 712 | # Pick these numbers based on resources on worker box and the nature of the task. 713 | # If autoscale option is available, worker_concurrency will be ignored. 714 | # http://docs.celeryproject.org/en/latest/reference/celery.bin.worker.html#cmdoption-celery-worker-autoscale 715 | # Example: worker_autoscale = 16,12 716 | # worker_autoscale = 717 | 718 | # Used to increase the number of tasks that a worker prefetches which can improve performance. 719 | # The number of processes multiplied by worker_prefetch_multiplier is the number of tasks 720 | # that are prefetched by a worker. A value greater than 1 can result in tasks being unnecessarily 721 | # blocked if there are multiple workers and one worker prefetches tasks that sit behind long 722 | # running tasks while another worker has unutilized processes that are unable to process the already 723 | # claimed blocked tasks. 724 | # https://docs.celeryproject.org/en/stable/userguide/optimizing.html#prefetch-limits 725 | # Example: worker_prefetch_multiplier = 1 726 | # worker_prefetch_multiplier = 727 | 728 | # Umask that will be used when starting workers with the ``airflow celery worker`` 729 | # in daemon mode. This control the file-creation mode mask which determines the initial 730 | # value of file permission bits for newly created files. 731 | worker_umask = 0o077 732 | 733 | # The Celery broker URL. Celery supports RabbitMQ, Redis and experimentally 734 | # a sqlalchemy database. Refer to the Celery documentation for more information. 735 | broker_url = redis://redis:6379/0 736 | 737 | # The Celery result_backend. When a job finishes, it needs to update the 738 | # metadata of the job. 
Therefore it will post a message on a message bus, 739 | # or insert it into a database (depending of the backend) 740 | # This status is used by the scheduler to update the state of the task 741 | # The use of a database is highly recommended 742 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#task-result-backend-settings 743 | result_backend = db+postgresql://postgres:airflow@postgres/airflow 744 | 745 | # Celery Flower is a sweet UI for Celery. Airflow has a shortcut to start 746 | # it ``airflow celery flower``. This defines the IP that Celery Flower runs on 747 | flower_host = 0.0.0.0 748 | 749 | # The root URL for Flower 750 | # Example: flower_url_prefix = /flower 751 | flower_url_prefix = 752 | 753 | # This defines the port that Celery Flower runs on 754 | flower_port = 5555 755 | 756 | # Securing Flower with Basic Authentication 757 | # Accepts user:password pairs separated by a comma 758 | # Example: flower_basic_auth = user1:password1,user2:password2 759 | flower_basic_auth = 760 | 761 | # How many processes CeleryExecutor uses to sync task state. 762 | # 0 means to use max(1, number of cores - 1) processes. 763 | sync_parallelism = 0 764 | 765 | # Import path for celery configuration options 766 | celery_config_options = airflow.config_templates.default_celery.DEFAULT_CELERY_CONFIG 767 | ssl_active = False 768 | ssl_key = 769 | ssl_cert = 770 | ssl_cacert = 771 | 772 | # Celery Pool implementation. 773 | # Choices include: ``prefork`` (default), ``eventlet``, ``gevent`` or ``solo``. 774 | # See: 775 | # https://docs.celeryproject.org/en/latest/userguide/workers.html#concurrency 776 | # https://docs.celeryproject.org/en/latest/userguide/concurrency/eventlet.html 777 | pool = prefork 778 | 779 | # The number of seconds to wait before timing out ``send_task_to_executor`` or 780 | # ``fetch_celery_task_state`` operations. 781 | operation_timeout = 1.0 782 | 783 | # Celery task will report its status as 'started' when the task is executed by a worker. 784 | # This is used in Airflow to keep track of the running tasks and if a Scheduler is restarted 785 | # or run in HA mode, it can adopt the orphan tasks launched by previous SchedulerJob. 786 | task_track_started = True 787 | 788 | # Time in seconds after which Adopted tasks are cleared by CeleryExecutor. This is helpful to clear 789 | # stalled tasks. 790 | task_adoption_timeout = 600 791 | 792 | # The Maximum number of retries for publishing task messages to the broker when failing 793 | # due to ``AirflowTaskTimeout`` error before giving up and marking Task as failed. 794 | task_publish_max_retries = 3 795 | 796 | # Worker initialisation check to validate Metadata Database connection 797 | worker_precheck = False 798 | 799 | [celery_broker_transport_options] 800 | 801 | # This section is for specifying options which can be passed to the 802 | # underlying celery broker transport. See: 803 | # http://docs.celeryproject.org/en/latest/userguide/configuration.html#std:setting-broker_transport_options 804 | # The visibility timeout defines the number of seconds to wait for the worker 805 | # to acknowledge the task before the message is redelivered to another worker. 806 | # Make sure to increase the visibility timeout to match the time of the longest 807 | # ETA you're planning to use. 808 | # visibility_timeout is only supported for Redis and SQS celery brokers. 
809 | # See: 810 | # http://docs.celeryproject.org/en/master/userguide/configuration.html#std:setting-broker_transport_options 811 | # Example: visibility_timeout = 21600 812 | # visibility_timeout = 813 | 814 | [dask] 815 | 816 | # This section only applies if you are using the DaskExecutor in 817 | # [core] section above 818 | # The IP address and port of the Dask cluster's scheduler. 819 | cluster_address = 127.0.0.1:8786 820 | 821 | # TLS/ SSL settings to access a secured Dask scheduler. 822 | tls_ca = 823 | tls_cert = 824 | tls_key = 825 | 826 | [scheduler] 827 | # Task instances listen for external kill signal (when you clear tasks 828 | # from the CLI or the UI), this defines the frequency at which they should 829 | # listen (in seconds). 830 | job_heartbeat_sec = 5 831 | 832 | # The scheduler constantly tries to trigger new tasks (look at the 833 | # scheduler section in the docs for more information). This defines 834 | # how often the scheduler should run (in seconds). 835 | scheduler_heartbeat_sec = 5 836 | 837 | # The number of times to try to schedule each DAG file 838 | # -1 indicates unlimited number 839 | num_runs = -1 840 | 841 | # Controls how long the scheduler will sleep between loops, but if there was nothing to do 842 | # in the loop. i.e. if it scheduled something then it will start the next loop 843 | # iteration straight away. 844 | scheduler_idle_sleep_time = 1 845 | 846 | # Number of seconds after which a DAG file is parsed. The DAG file is parsed every 847 | # ``min_file_process_interval`` number of seconds. Updates to DAGs are reflected after 848 | # this interval. Keeping this number low will increase CPU usage. 849 | min_file_process_interval = 30 850 | 851 | # How often (in seconds) to scan the DAGs directory for new files. Default to 5 minutes. 852 | dag_dir_list_interval = 300 853 | 854 | # How often should stats be printed to the logs. Setting to 0 will disable printing stats 855 | print_stats_interval = 30 856 | 857 | # How often (in seconds) should pool usage stats be sent to statsd (if statsd_on is enabled) 858 | pool_metrics_interval = 5.0 859 | 860 | # If the last scheduler heartbeat happened more than scheduler_health_check_threshold 861 | # ago (in seconds), scheduler is considered unhealthy. 862 | # This is used by the health check in the "/health" endpoint 863 | scheduler_health_check_threshold = 30 864 | 865 | # How often (in seconds) should the scheduler check for orphaned tasks and SchedulerJobs 866 | orphaned_tasks_check_interval = 300.0 867 | child_process_log_directory = /opt/airflow/logs/scheduler 868 | 869 | # Local task jobs periodically heartbeat to the DB. If the job has 870 | # not heartbeat in this many seconds, the scheduler will mark the 871 | # associated task instance as failed and will re-schedule the task. 872 | scheduler_zombie_task_threshold = 300 873 | 874 | # Turn off scheduler catchup by setting this to ``False``. 875 | # Default behavior is unchanged and 876 | # Command Line Backfills still work, but the scheduler 877 | # will not do scheduler catchup if this is ``False``, 878 | # however it can be set on a per DAG basis in the 879 | # DAG definition (catchup) 880 | catchup_by_default = True 881 | 882 | # This changes the batch size of queries in the scheduling main loop. 883 | # If this is too high, SQL query performance may be impacted by 884 | # complexity of query predicate, and/or excessive locking. 885 | # Additionally, you may hit the maximum allowable query length for your db. 
886 | # Set this to 0 for no limit (not advised) 887 | max_tis_per_query = 512 888 | 889 | # Should the scheduler issue ``SELECT ... FOR UPDATE`` in relevant queries. 890 | # If this is set to False then you should not run more than a single 891 | # scheduler at once 892 | use_row_level_locking = True 893 | 894 | # Max number of DAGs to create DagRuns for per scheduler loop. 895 | max_dagruns_to_create_per_loop = 10 896 | 897 | # How many DagRuns should a scheduler examine (and lock) when scheduling 898 | # and queuing tasks. 899 | max_dagruns_per_loop_to_schedule = 20 900 | 901 | # Should the Task supervisor process perform a "mini scheduler" to attempt to schedule more tasks of the 902 | # same DAG. Leaving this on will mean tasks in the same DAG execute quicker, but might starve out other 903 | # dags in some circumstances 904 | schedule_after_task_execution = True 905 | 906 | # The scheduler can run multiple processes in parallel to parse dags. 907 | # This defines how many processes will run. 908 | parsing_processes = 2 909 | 910 | # One of ``modified_time``, ``random_seeded_by_host`` and ``alphabetical``. 911 | # The scheduler will list and sort the dag files to decide the parsing order. 912 | # 913 | # * ``modified_time``: Sort by modified time of the files. This is useful on large scale to parse the 914 | # recently modified DAGs first. 915 | # * ``random_seeded_by_host``: Sort randomly across multiple Schedulers but with same order on the 916 | # same host. This is useful when running with Scheduler in HA mode where each scheduler can 917 | # parse different DAG files. 918 | # * ``alphabetical``: Sort by filename 919 | file_parsing_sort_mode = modified_time 920 | 921 | # Turn off scheduler use of cron intervals by setting this to False. 922 | # DAGs submitted manually in the web UI or with trigger_dag will still run. 923 | use_job_schedule = True 924 | 925 | # Allow externally triggered DagRuns for Execution Dates in the future 926 | # Only has effect if schedule_interval is set to None in DAG 927 | allow_trigger_in_future = False 928 | 929 | # DAG dependency detector class to use 930 | dependency_detector = airflow.serialization.serialized_objects.DependencyDetector 931 | 932 | # How often to check for expired trigger requests that have not run yet. 933 | trigger_timeout_check_interval = 15 934 | 935 | [triggerer] 936 | # How many triggers a single Triggerer will run at once, by default. 937 | default_capacity = 1000 938 | 939 | [kerberos] 940 | ccache = /tmp/airflow_krb5_ccache 941 | 942 | # gets augmented with fqdn 943 | principal = airflow 944 | reinit_frequency = 3600 945 | kinit_path = kinit 946 | keytab = airflow.keytab 947 | 948 | # Allow to disable ticket forwardability. 949 | forwardable = True 950 | 951 | # Allow to remove source IP from token, useful when using token behind NATted Docker host. 952 | include_ip = True 953 | 954 | [github_enterprise] 955 | api_rev = v3 956 | 957 | [elasticsearch] 958 | # Elasticsearch host 959 | host = 960 | 961 | # Format of the log_id, which is used to query for a given tasks logs 962 | log_id_template = {dag_id}-{task_id}-{execution_date}-{try_number} 963 | 964 | # Used to mark the end of a log stream for a task 965 | end_of_log_mark = end_of_log 966 | 967 | # Qualified URL for an elasticsearch frontend (like Kibana) with a template argument for log_id 968 | # Code will construct log_id using the log_id template from the argument above. 
969 | # NOTE: scheme will default to https if one is not provided 970 | # Example: frontend = http://localhost:5601/app/kibana#/discover?_a=(columns:!(message),query:(language:kuery,query:'log_id: "{log_id}"'),sort:!(log.offset,asc)) 971 | frontend = 972 | 973 | # Write the task logs to the stdout of the worker, rather than the default files 974 | write_stdout = False 975 | 976 | # Instead of the default log formatter, write the log lines as JSON 977 | json_format = False 978 | 979 | # Log fields to also attach to the json output, if enabled 980 | json_fields = asctime, filename, lineno, levelname, message 981 | 982 | # The field where host name is stored (normally either `host` or `host.name`) 983 | host_field = host 984 | 985 | # The field where offset is stored (normally either `offset` or `log.offset`) 986 | offset_field = offset 987 | 988 | [elasticsearch_configs] 989 | use_ssl = False 990 | verify_certs = True 991 | 992 | [kubernetes] 993 | # Path to the YAML pod file that forms the basis for KubernetesExecutor workers. 994 | pod_template_file = 995 | 996 | # The repository of the Kubernetes Image for the Worker to Run 997 | worker_container_repository = 998 | 999 | # The tag of the Kubernetes Image for the Worker to Run 1000 | worker_container_tag = 1001 | 1002 | # The Kubernetes namespace where airflow workers should be created. Defaults to ``default`` 1003 | namespace = default 1004 | 1005 | # If True, all worker pods will be deleted upon termination 1006 | delete_worker_pods = True 1007 | 1008 | # If False (and delete_worker_pods is True), 1009 | # failed worker pods will not be deleted so users can investigate them. 1010 | # This only prevents removal of worker pods where the worker itself failed, 1011 | # not when the task it ran failed. 1012 | delete_worker_pods_on_failure = False 1013 | 1014 | # Number of Kubernetes Worker Pod creation calls per scheduler loop. 1015 | # Note that the current default of "1" will only launch a single pod 1016 | # per-heartbeat. It is HIGHLY recommended that users increase this 1017 | # number to match the tolerance of their kubernetes cluster for 1018 | # better performance. 1019 | worker_pods_creation_batch_size = 1 1020 | 1021 | # Allows users to launch pods in multiple namespaces. 1022 | # Will require creating a cluster-role for the scheduler 1023 | multi_namespace_mode = False 1024 | 1025 | # Use the service account kubernetes gives to pods to connect to kubernetes cluster. 1026 | # It's intended for clients that expect to be running inside a pod running on kubernetes. 1027 | # It will raise an exception if called from a process not running in a kubernetes environment. 1028 | in_cluster = True 1029 | 1030 | # When running with in_cluster=False change the default cluster_context or config_file 1031 | # options to Kubernetes client. Leave blank these to use default behaviour like ``kubectl`` has. 1032 | # cluster_context = 1033 | 1034 | # Path to the kubernetes configfile to be used when ``in_cluster`` is set to False 1035 | # config_file = 1036 | 1037 | # Keyword parameters to pass while calling a kubernetes client core_v1_api methods 1038 | # from Kubernetes Executor provided as a single line formatted JSON dictionary string. 1039 | # List of supported params are similar for all core_v1_apis, hence a single config 1040 | # variable for all apis. 
See: 1041 | # https://raw.githubusercontent.com/kubernetes-client/python/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/api/core_v1_api.py 1042 | kube_client_request_args = 1043 | 1044 | # Optional keyword arguments to pass to the ``delete_namespaced_pod`` kubernetes client 1045 | # ``core_v1_api`` method when using the Kubernetes Executor. 1046 | # This should be an object and can contain any of the options listed in the ``v1DeleteOptions`` 1047 | # class defined here: 1048 | # https://github.com/kubernetes-client/python/blob/41f11a09995efcd0142e25946adc7591431bfb2f/kubernetes/client/models/v1_delete_options.py#L19 1049 | # Example: delete_option_kwargs = {"grace_period_seconds": 10} 1050 | delete_option_kwargs = 1051 | 1052 | # Enables TCP keepalive mechanism. This prevents Kubernetes API requests to hang indefinitely 1053 | # when idle connection is time-outed on services like cloud load balancers or firewalls. 1054 | enable_tcp_keepalive = True 1055 | 1056 | # When the `enable_tcp_keepalive` option is enabled, TCP probes a connection that has 1057 | # been idle for `tcp_keep_idle` seconds. 1058 | tcp_keep_idle = 120 1059 | 1060 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond 1061 | # to a keepalive probe, TCP retransmits the probe after `tcp_keep_intvl` seconds. 1062 | tcp_keep_intvl = 30 1063 | 1064 | # When the `enable_tcp_keepalive` option is enabled, if Kubernetes API does not respond 1065 | # to a keepalive probe, TCP retransmits the probe `tcp_keep_cnt number` of times before 1066 | # a connection is considered to be broken. 1067 | tcp_keep_cnt = 6 1068 | 1069 | # Set this to false to skip verifying SSL certificate of Kubernetes python client. 1070 | verify_ssl = True 1071 | 1072 | # How long in seconds a worker can be in Pending before it is considered a failure 1073 | worker_pods_pending_timeout = 300 1074 | 1075 | # How often in seconds to check if Pending workers have exceeded their timeouts 1076 | worker_pods_pending_timeout_check_interval = 120 1077 | 1078 | # How often in seconds to check for task instances stuck in "queued" status without a pod 1079 | worker_pods_queued_check_interval = 60 1080 | 1081 | # How many pending pods to check for timeout violations in each check interval. 1082 | # You may want this higher if you have a very large cluster and/or use ``multi_namespace_mode``. 1083 | worker_pods_pending_timeout_batch_size = 100 1084 | 1085 | [smart_sensor] 1086 | # When `use_smart_sensor` is True, Airflow redirects multiple qualified sensor tasks to 1087 | # smart sensor task. 1088 | use_smart_sensor = False 1089 | 1090 | # `shard_code_upper_limit` is the upper limit of `shard_code` value. The `shard_code` is generated 1091 | # by `hashcode % shard_code_upper_limit`. 1092 | shard_code_upper_limit = 10000 1093 | 1094 | # The number of running smart sensor processes for each service. 1095 | shards = 5 1096 | 1097 | # comma separated sensor classes support in smart_sensor. 
1098 | sensors_enabled = NamedHivePartitionSensor -------------------------------------------------------------------------------- /airflow-sql/dags/dbt_orchestration.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime 2 | from datetime import timedelta 3 | from pathlib import Path 4 | from airflow import DAG 5 | from dbt_airflow.core.config import DbtAirflowConfig 6 | from dbt_airflow.core.config import DbtProfileConfig 7 | from dbt_airflow.core.config import DbtProjectConfig 8 | from dbt_airflow.core.task_group import DbtTaskGroup 9 | from dbt_airflow.operators.execution import ExecutionOperator 10 | 11 | 12 | with DAG( 13 | dag_id='dbt_workflow_orchestration', 14 | start_date=datetime(2024, 10, 18), 15 | catchup=False, 16 | tags=['dbt dag'], 17 | default_args={ 18 | 'owner': 'airflow', 19 | 'retries': 1, 20 | 'retry_delay': timedelta(minutes=5), 21 | }, 22 | 23 | ) as dag: 24 | 25 | 26 | run_dbt = DbtTaskGroup( 27 | group_id='transform', 28 | dbt_project_config=DbtProjectConfig( 29 | project_path=Path('/opt/dbt'), 30 | manifest_path=Path('/opt/dbt/target/manifest.json'), 31 | ), 32 | dbt_profile_config=DbtProfileConfig( 33 | profiles_path=Path('/opt/dbt'), 34 | target='dev', 35 | ), 36 | dbt_airflow_config=DbtAirflowConfig( 37 | execution_operator=ExecutionOperator.BASH, 38 | ), 39 | ) 40 | 41 | run_dbt -------------------------------------------------------------------------------- /airflow-sql/docker-compose.yml: -------------------------------------------------------------------------------- 1 | # Licensed to the Apache Software Foundation (ASF) under one 2 | # or more contributor license agreements. See the NOTICE file 3 | # distributed with this work for additional information 4 | # regarding copyright ownership. The ASF licenses this file 5 | # to you under the Apache License, Version 2.0 (the 6 | # "License"); you may not use this file except in compliance 7 | # with the License. You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, 12 | # software distributed under the License is distributed on an 13 | # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY 14 | # KIND, either express or implied. See the License for the 15 | # specific language governing permissions and limitations 16 | # under the License. 17 | # 18 | 19 | # Basic Airflow cluster configuration for CeleryExecutor with Redis and PostgreSQL. 20 | # 21 | # WARNING: This configuration is for local development. Do not use it in a production deployment. 22 | # 23 | # This configuration supports basic configuration using environment variables or an .env file 24 | # The following variables are supported: 25 | # 26 | # AIRFLOW_IMAGE_NAME - Docker image name used to run Airflow. 27 | # Default: apache/airflow:2.5.3 28 | # AIRFLOW_UID - User ID in Airflow containers 29 | # Default: 50000 30 | # AIRFLOW_PROJ_DIR - Base path to which all the files will be volumed. 31 | # Default: . 32 | # Those configurations are useful mostly in case of standalone testing/running Airflow in test/try-out mode 33 | # 34 | # _AIRFLOW_WWW_USER_USERNAME - Username for the administrator account (if requested). 35 | # Default: airflow 36 | # _AIRFLOW_WWW_USER_PASSWORD - Password for the administrator account (if requested). 37 | # Default: airflow 38 | # _PIP_ADDITIONAL_REQUIREMENTS - Additional PIP requirements to add when starting all containers. 
39 | # Use this option ONLY for quick checks. Installing requirements at container 40 | # startup is done EVERY TIME the service is started. 41 | # A better way is to build a custom image or extend the official image 42 | # as described in https://airflow.apache.org/docs/docker-stack/build.html. 43 | # Default: '' 44 | # 45 | # Feel free to modify this file to suit your needs. 46 | --- 47 | version: '3.8' 48 | x-airflow-common: 49 | &airflow-common 50 | # In order to add custom dependencies or upgrade provider packages you can use your extended image. 51 | # Comment the image line, place your Dockerfile in the directory where you placed the docker-compose.yaml 52 | # and uncomment the "build" line below, Then run `docker-compose build` to build the images. 53 | image: ${AIRFLOW_IMAGE_NAME:-airflow-sql} 54 | # build: . 55 | environment: 56 | &airflow-common-env 57 | AIRFLOW__CORE__EXECUTOR: CeleryExecutor 58 | AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 59 | # For backward compatibility, with Airflow <2.3 60 | #AIRFLOW__CORE__SQL_ALCHEMY_CONN: postgresql+psycopg2://airflow:airflow@postgres/airflow 61 | AIRFLOW__CELERY__RESULT_BACKEND: db+postgresql://airflow:airflow@postgres/airflow 62 | AIRFLOW__CELERY__BROKER_URL: redis://:@redis:6379/0 63 | AIRFLOW__CORE__FERNET_KEY: '' 64 | AIRFLOW__CORE__DAGS_ARE_PAUSED_AT_CREATION: 'true' 65 | AIRFLOW__CORE__LOAD_EXAMPLES: 'false' 66 | AIRFLOW__API__AUTH_BACKENDS: 'airflow.api.auth.backend.basic_auth,airflow.api.auth.backend.session' 67 | OPENLINEAGE_URL: 'http://192.168.1.39:5000' 68 | OPENLINEAGE_NAMESPACE: 'airflow' 69 | PGUID: 'etl' 70 | PGPASS: 'demopass' 71 | 72 | # yamllint disable rule:line-length 73 | # Use simple http server on scheduler for health checks 74 | # See https://airflow.apache.org/docs/apache-airflow/stable/administration-and-deployment/logging-monitoring/check-health.html#scheduler-health-check-server 75 | # yamllint enable rule:line-length 76 | AIRFLOW__SCHEDULER__ENABLE_HEALTH_CHECK: 'true' 77 | # WARNING: Use _PIP_ADDITIONAL_REQUIREMENTS option ONLY for a quick checks 78 | # for other purpose (development, test and especially production usage) build/extend Airflow image. 
79 | _PIP_ADDITIONAL_REQUIREMENTS: ${_PIP_ADDITIONAL_REQUIREMENTS:-} 80 | volumes: 81 | - ./dags:/opt/airflow/dags 82 | - ./logs:/opt/airflow/logs 83 | - ./plugins:/opt/airflow/plugins 84 | - ./great_expectations:/opt/airflow/great_expectations 85 | - ./sql:/opt/sql 86 | - ./warehouse:/opt/dbt 87 | - ./profile:/opt/dbt/profiles 88 | - ./profile:/opt/airflow/.dbt/ 89 | - ./config/airflow.cfg:/opt/airflow/airflow.cfg 90 | user: "${AIRFLOW_UID:-50000}:0" 91 | depends_on: 92 | &airflow-common-depends-on 93 | redis: 94 | condition: service_healthy 95 | postgres: 96 | condition: service_healthy 97 | 98 | services: 99 | postgres: 100 | image: postgres:13 101 | environment: 102 | POSTGRES_USER: airflow 103 | POSTGRES_PASSWORD: airflow 104 | POSTGRES_DB: airflow 105 | volumes: 106 | - postgres-db-volume:/var/lib/postgresql/data 107 | healthcheck: 108 | test: ["CMD", "pg_isready", "-U", "airflow"] 109 | interval: 10s 110 | retries: 5 111 | start_period: 5s 112 | restart: always 113 | 114 | redis: 115 | image: redis:latest 116 | expose: 117 | - 6379 118 | healthcheck: 119 | test: ["CMD", "redis-cli", "ping"] 120 | interval: 10s 121 | timeout: 30s 122 | retries: 50 123 | start_period: 30s 124 | restart: always 125 | 126 | airflow-webserver: 127 | <<: *airflow-common 128 | command: webserver 129 | ports: 130 | - 9093:8080 131 | healthcheck: 132 | test: ["CMD", "curl", "--fail", "http://localhost:9093/health"] 133 | interval: 30s 134 | timeout: 10s 135 | retries: 5 136 | start_period: 30s 137 | restart: always 138 | depends_on: 139 | <<: *airflow-common-depends-on 140 | airflow-init: 141 | condition: service_completed_successfully 142 | 143 | airflow-scheduler: 144 | <<: *airflow-common 145 | command: scheduler 146 | healthcheck: 147 | test: ["CMD", "curl", "--fail", "http://localhost:8974/health"] 148 | interval: 30s 149 | timeout: 10s 150 | retries: 5 151 | start_period: 30s 152 | restart: always 153 | depends_on: 154 | <<: *airflow-common-depends-on 155 | airflow-init: 156 | condition: service_completed_successfully 157 | 158 | airflow-worker: 159 | <<: *airflow-common 160 | # start 161 | tmpfs: 162 | - '/opt/airflow/' 163 | command: celery worker 164 | healthcheck: 165 | test: 166 | - "CMD-SHELL" 167 | - 'celery --app airflow.executors.celery_executor.app inspect ping -d "celery@$${HOSTNAME}"' 168 | interval: 30s 169 | timeout: 10s 170 | retries: 5 171 | start_period: 30s 172 | environment: 173 | <<: *airflow-common-env 174 | # Required to handle warm shutdown of the celery workers properly 175 | # See https://airflow.apache.org/docs/docker-stack/entrypoint.html#signal-propagation 176 | DUMB_INIT_SETSID: "0" 177 | restart: always 178 | depends_on: 179 | <<: *airflow-common-depends-on 180 | airflow-init: 181 | condition: service_completed_successfully 182 | 183 | airflow-triggerer: 184 | <<: *airflow-common 185 | command: triggerer 186 | healthcheck: 187 | test: ["CMD-SHELL", 'airflow jobs check --job-type TriggererJob --hostname "$${HOSTNAME}"'] 188 | interval: 30s 189 | timeout: 10s 190 | retries: 5 191 | start_period: 30s 192 | restart: always 193 | depends_on: 194 | <<: *airflow-common-depends-on 195 | airflow-init: 196 | condition: service_completed_successfully 197 | 198 | airflow-init: 199 | <<: *airflow-common 200 | entrypoint: /bin/bash 201 | # yamllint disable rule:line-length 202 | command: 203 | - -c 204 | - | 205 | function ver() { 206 | printf "%04d%04d%04d%04d" $${1//./ } 207 | } 208 | airflow_version=$$(AIRFLOW__LOGGING__LOGGING_LEVEL=INFO && gosu airflow airflow version) 209 | 
airflow_version_comparable=$$(ver $${airflow_version}) 210 | min_airflow_version=2.9.0 211 | min_airflow_version_comparable=$$(ver $${min_airflow_version}) 212 | if (( airflow_version_comparable < min_airflow_version_comparable )); then 213 | echo 214 | echo -e "\033[1;31mERROR!!!: Too old Airflow version $${airflow_version}!\e[0m" 215 | echo "The minimum Airflow version supported: $${min_airflow_version}. Only use this or higher!" 216 | echo 217 | exit 1 218 | fi 219 | if [[ -z "${AIRFLOW_UID}" ]]; then 220 | echo 221 | echo -e "\033[1;33mWARNING!!!: AIRFLOW_UID not set!\e[0m" 222 | echo "If you are on Linux, you SHOULD follow the instructions below to set " 223 | echo "AIRFLOW_UID environment variable, otherwise files will be owned by root." 224 | echo "For other operating systems you can get rid of the warning with manually created .env file:" 225 | echo " See: https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#setting-the-right-airflow-user" 226 | echo 227 | fi 228 | one_meg=1048576 229 | mem_available=$$(($$(getconf _PHYS_PAGES) * $$(getconf PAGE_SIZE) / one_meg)) 230 | cpus_available=$$(grep -cE 'cpu[0-9]+' /proc/stat) 231 | disk_available=$$(df / | tail -1 | awk '{print $$4}') 232 | warning_resources="false" 233 | if (( mem_available < 4000 )) ; then 234 | echo 235 | echo -e "\033[1;33mWARNING!!!: Not enough memory available for Docker.\e[0m" 236 | echo "At least 4GB of memory required. You have $$(numfmt --to iec $$((mem_available * one_meg)))" 237 | echo 238 | warning_resources="true" 239 | fi 240 | if (( cpus_available < 2 )); then 241 | echo 242 | echo -e "\033[1;33mWARNING!!!: Not enough CPUS available for Docker.\e[0m" 243 | echo "At least 2 CPUs recommended. You have $${cpus_available}" 244 | echo 245 | warning_resources="true" 246 | fi 247 | if (( disk_available < one_meg * 10 )); then 248 | echo 249 | echo -e "\033[1;33mWARNING!!!: Not enough Disk space available for Docker.\e[0m" 250 | echo "At least 10 GBs recommended. You have $$(numfmt --to iec $$((disk_available * 1024 )))" 251 | echo 252 | warning_resources="true" 253 | fi 254 | if [[ $${warning_resources} == "true" ]]; then 255 | echo 256 | echo -e "\033[1;33mWARNING!!!: You have not enough resources to run Airflow (see above)!\e[0m" 257 | echo "Please follow the instructions to increase amount of resources available:" 258 | echo " https://airflow.apache.org/docs/apache-airflow/stable/howto/docker-compose/index.html#before-you-begin" 259 | echo 260 | fi 261 | mkdir -p /sources/logs /sources/dags /sources/plugins 262 | chown -R "${AIRFLOW_UID}:0" /sources/{logs,dags,plugins} 263 | exec /entrypoint airflow version 264 | # yamllint enable rule:line-length 265 | environment: 266 | <<: *airflow-common-env 267 | _AIRFLOW_DB_UPGRADE: 'true' 268 | _AIRFLOW_WWW_USER_CREATE: 'true' 269 | _AIRFLOW_WWW_USER_USERNAME: ${_AIRFLOW_WWW_USER_USERNAME:-airflow} 270 | _AIRFLOW_WWW_USER_PASSWORD: ${_AIRFLOW_WWW_USER_PASSWORD:-airflow} 271 | _PIP_ADDITIONAL_REQUIREMENTS: '' 272 | user: "0:0" 273 | volumes: 274 | - ${AIRFLOW_PROJ_DIR:-.}:/sources 275 | 276 | airflow-cli: 277 | <<: *airflow-common 278 | profiles: 279 | - debug 280 | environment: 281 | <<: *airflow-common-env 282 | CONNECTION_CHECK_MAX_COUNT: "0" 283 | # Workaround for entrypoint issue. See: https://github.com/apache/airflow/issues/16252 284 | command: 285 | - bash 286 | - -c 287 | - airflow 288 | 289 | # You can enable flower by adding "--profile flower" option e.g. 
docker-compose --profile flower up 290 | # or by explicitly targeted on the command line e.g. docker-compose up flower. 291 | # See: https://docs.docker.com/compose/profiles/ 292 | flower: 293 | <<: *airflow-common 294 | command: celery flower 295 | profiles: 296 | - flower 297 | ports: 298 | - 5553:5555 299 | healthcheck: 300 | test: ["CMD", "curl", "--fail", "http://localhost:5553/"] 301 | interval: 30s 302 | timeout: 10s 303 | retries: 5 304 | start_period: 30s 305 | restart: always 306 | depends_on: 307 | <<: *airflow-common-depends-on 308 | airflow-init: 309 | condition: service_completed_successfully 310 | 311 | volumes: 312 | postgres-db-volume: -------------------------------------------------------------------------------- /airflow-sql/profile/.user.yml: -------------------------------------------------------------------------------- 1 | id: 869bb9f2-dcd2-40fe-9088-705cd4679199 2 | -------------------------------------------------------------------------------- /airflow-sql/profile/profiles.yml: -------------------------------------------------------------------------------- 1 | warehouse: 2 | outputs: 3 | 4 | dev: 5 | type: postgres 6 | threads: 4 7 | host: 192.168.1.39 8 | port: 5432 9 | user: etl 10 | pass: demopass 11 | dbname: adventureworks 12 | schema: public 13 | 14 | target: dev 15 | 16 | -------------------------------------------------------------------------------- /airflow-sql/warehouse/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /airflow-sql/warehouse/.user.yml: -------------------------------------------------------------------------------- 1 | id: 358f4a66-8e6d-402e-8d11-94858b0eb9e9 2 | -------------------------------------------------------------------------------- /airflow-sql/warehouse/.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "dbt.queryLimit": 500, 3 | "dbt.enableNewLineagePanel": true 4 | } -------------------------------------------------------------------------------- /airflow-sql/warehouse/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! This project is housed on a docker container. You can check the "airflow-sql-airflow-worker-1" container. dbt project is under the following /opt/dbt directory. 2 | 3 | ### You can issue following commands to confirm project is functional on docker. 
4 | 5 | Try running the following commands: 6 | - dbt debug 7 | - dbt deps 8 | - dbt compile 9 | - dbt build 10 | 11 | 12 | ### Resources: 13 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 14 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 15 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 16 | - Find [dbt events](https://events.getdbt.com) near you 17 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 18 | -------------------------------------------------------------------------------- /airflow-sql/warehouse/analyses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/airflow-sql/warehouse/analyses/.gitkeep -------------------------------------------------------------------------------- /airflow-sql/warehouse/archive/stg_salesorderheadersalesreason.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'salesorderheadersalesreason') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | modifieddate, 9 | salesorderid, 10 | salesreasonid 11 | from source 12 | ) 13 | 14 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/archive/stg_salesreason.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'vw_salesreason') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | salesreason, 9 | modifieddate, 10 | salesreasonid 11 | from source 12 | ) 13 | 14 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/archive/stg_store.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'vw_store') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | rowguid, 9 | demographics, 10 | storename, 11 | modifieddate, 12 | salespersonid, 13 | businessentityid 14 | from source 15 | ) 16 | 17 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/archive/testDate.sql: -------------------------------------------------------------------------------- 1 | with date_dimension as ( 2 | select * from {{ ref('stg_date') }} 3 | ), 4 | full_dt as ( 5 | {{ dbt_date.get_base_dates(start_date="2011-01-01", end_date="2014-12-31") }} 6 | ), 7 | full_dt_tr as ( 8 | select 9 | d.*, 10 | f.date_day as fulldt, 11 | {{ dbt_date.convert_timezone("f.date_day", "America/New_York", "UTC") }} as dulldtz, 12 | {{ dbt_date.convert_timezone("f.date_day", "America/New_York", source_tz="UTC") }} as dulldtzt, 13 | {{ dbt_date.convert_timezone("f.date_day", "America/New_York") }} as test, 14 | --f.date_day AT TIME ZONE 'PST' AS "direct_pst", 15 | --f.date_day AT TIME ZONE 'UTC' AS "direct_utc", 16 | f.date_day::timestamp AT TIME ZONE 'UTC' AS "ts_utc" 17 | from 18 | date_dimension d 19 | left join full_dt f on d.date_day = cast(f.date_day as date) 20 | ) 21 | select 22 | {{ dbt_utils.generate_surrogate_key(['ts_utc']) }} as date_key, 23 | * 24 | From full_dt_tr 
-------------------------------------------------------------------------------- /airflow-sql/warehouse/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'warehouse' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'warehouse' 11 | 12 | vars: 13 | 'dbt_date:time_zone': 'America/New_York' 14 | 15 | flags: 16 | require_explicit_package_overrides_for_builtin_materializations: false 17 | # These configurations specify where dbt should look for different types of files. 18 | # The `model-paths` config, for example, states that models in this project can be 19 | # found in the "models/" directory. You probably won't need to change these! 20 | model-paths: ["models"] 21 | analysis-paths: ["analyses"] 22 | test-paths: ["tests"] 23 | seed-paths: ["seeds"] 24 | macro-paths: ["macros"] 25 | snapshot-paths: ["snapshots"] 26 | 27 | target-path: "target" # directory which will store compiled SQL files 28 | clean-targets: # directories to be removed by `dbt clean` 29 | - "target" 30 | - "dbt_packages" 31 | 32 | 33 | # Configuring models 34 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 35 | 36 | # In this example config, we tell dbt to build all models in the example/ 37 | # directory as views. These settings can be overridden in the individual model 38 | # files using the `{{ config(...) }}` macro. 39 | models: 40 | warehouse: 41 | # Config indicated by + and applies to all files under models/example/ 42 | staging: 43 | +materialized: table 44 | +schema: staging 45 | +elementary: 46 | schema_change: 47 | enabled: true 48 | freshness: 49 | enabled: true 50 | threshold: 24 hour 51 | volume_anomaly: 52 | enabled: true 53 | threshold: 0.2 # 20% deviation 54 | 55 | mart: 56 | +materialized: table 57 | +schema: sales 58 | 59 | seeds: 60 | warehouse: 61 | +schema: staging -------------------------------------------------------------------------------- /airflow-sql/warehouse/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/airflow-sql/warehouse/macros/.gitkeep -------------------------------------------------------------------------------- /airflow-sql/warehouse/macros/generate_schema_name.sql: -------------------------------------------------------------------------------- 1 | {% macro generate_schema_name(custom_schema_name, node) -%} 2 | 3 | {%- set default_schema = target.schema -%} 4 | {%- if custom_schema_name is none -%} 5 | 6 | {{ default_schema }} 7 | 8 | {%- else -%} 9 | 10 | {{ custom_schema_name | trim }} 11 | 12 | {%- endif -%} 13 | 14 | {%- endmacro %} -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/mart/address.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ dbt_utils.generate_surrogate_key(['stg_address.addressid']) }} as address_key, 3 | stg_address.addressid, 4 | stg_address.city as city_name, 5 | stg_address.postalcode, 6 | stg_address.addressline1 || ' '|| coalesce(stg_address.addressline2, '') as Addressline, 7 | stg_stateprovince.statprovincename as state_name, 8 | 
stg_countryregion.countryregionname as country_name 9 | from {{ ref('stg_address') }} 10 | left join {{ ref('stg_stateprovince') }} on stg_address.stateprovinceid = stg_stateprovince.stateprovinceid 11 | left join {{ ref('stg_countryregion') }} on stg_stateprovince.countryregioncode = stg_countryregion.countryregioncode -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/mart/customer.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ dbt_utils.generate_surrogate_key(['stg_customer.customerid']) }} as customer_key, 3 | stg_customer.customerid, 4 | stg_person.businessentityid as personbusinessentityid, 5 | stg_person.title, 6 | stg_person.firstname || ' '|| lastname as customername, 7 | stg_person.houseownerflag, 8 | stg_person.occupation, 9 | stg_person.maritalstatus, 10 | stg_person.commutedistance, 11 | stg_person.education, 12 | stg_person.gender, 13 | stg_person.numbercarsowned, 14 | stg_person.totalchildren, 15 | stg_person.birthdate, 16 | stg_person.datefirstpurchase, 17 | stg_countryregion.countryregionname as country, 18 | stg_address.city, 19 | stg_stateprovince.statprovincename as state, 20 | stg_address.postalcode, 21 | stg_address.addressline1, 22 | stg_address.addressline2 23 | from {{ ref('stg_customer') }} 24 | left join {{ ref('stg_person') }} on stg_customer.personid = stg_person.businessentityid 25 | left join {{ ref('stg_entityaddress') }} on stg_entityaddress.businessentityid = stg_person.businessentityid 26 | left join {{ ref('stg_address') }} on stg_address.addressid = stg_entityaddress.addressid 27 | left join {{ ref('stg_stateprovince') }} on stg_stateprovince.stateprovinceid = stg_address.stateprovinceid 28 | left join {{ ref('stg_countryregion') }} on stg_countryregion.countryregioncode = stg_stateprovince.countryregioncode 29 | where persontype = 'IN' 30 | and addresstypeid = 2 -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/mart/date.sql: -------------------------------------------------------------------------------- 1 | with date_dimension as ( 2 | select * from {{ ref('stg_date') }} 3 | ), 4 | full_dt as ( 5 | {{ dbt_date.get_base_dates(start_date="2011-01-01", end_date="2014-12-31") }} 6 | ), 7 | full_dt_tr as ( 8 | select 9 | d.*, 10 | f.date_day as fulldt, 11 | {{ dbt_date.convert_timezone("f.date_day", "America/New_York", "UTC") }} as dulldtz, 12 | {{ dbt_date.convert_timezone("f.date_day", "America/New_York", source_tz="UTC") }} as dulldtzt, 13 | {{ dbt_date.convert_timezone("f.date_day", "America/New_York") }} as test, 14 | --f.date_day AT TIME ZONE 'PST' AS "direct_pst", 15 | f.date_day::timestamp "direct_dts", 16 | f.date_day::timestamp AT TIME ZONE 'UTC' AS "ts_utc" 17 | from 18 | date_dimension d 19 | left join full_dt f on d.date_day = cast(f.date_day as date) 20 | ) 21 | select 22 | {{ dbt_utils.generate_surrogate_key(['direct_dts']) }} as date_key, 23 | * 24 | From full_dt_tr -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/mart/orderstatus.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ dbt_utils.generate_surrogate_key(['stg_salesorderheader.status']) }} as order_status_key, 3 | status as order_status, 4 | case 5 | when status = 1 then 'in_process' 6 | when status = 2 then 'approved' 7 | when status = 3 then 'backordered' 8 | when status = 4 then 'rejected' 
9 | when status = 5 then 'shipped' 10 | when status = 6 then 'cancelled' 11 | else 'no_status' 12 | end as order_status_name 13 | from {{ ref('stg_salesorderheader') }} -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/mart/product.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ dbt_utils.generate_surrogate_key(['stg_product.productid']) }} as product_key, 3 | stg_product.productid, 4 | stg_product.productname as product_name, 5 | stg_product.productnumber, 6 | stg_product.color, 7 | stg_product.daystomanufacture, 8 | stg_product.safetystocklevel, 9 | stg_product.standardcost, 10 | stg_productsubcategory.productsubcategory as product_subcategory_name, 11 | stg_productcategory.productcategory as product_category_name, 12 | stg_product.sellstartdate, 13 | stg_product.sellenddate 14 | from {{ ref('stg_product') }} 15 | left join {{ ref('stg_productsubcategory') }} on stg_product.productsubcategoryid = stg_productsubcategory.productsubcategoryid 16 | left join {{ ref('stg_productcategory') }} on stg_productsubcategory.productcategoryid = stg_productcategory.productcategoryid -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/mart/sales.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ dbt_utils.generate_surrogate_key(['stg_salesorderdetail.salesorderid', 'salesorderdetailid']) }} as sales_key, 3 | {{ dbt_utils.generate_surrogate_key(['productid']) }} as product_key, 4 | {{ dbt_utils.generate_surrogate_key(['customerid']) }} as customer_key, 5 | {{ dbt_utils.generate_surrogate_key(['creditcardid']) }} as creditcard_key, 6 | {{ dbt_utils.generate_surrogate_key(['shiptoaddressid']) }} as ship_address_key, 7 | {{ dbt_utils.generate_surrogate_key(['status']) }} as order_status_key, 8 | {{ dbt_utils.generate_surrogate_key(['orderdate']) }} as order_date_key, 9 | {{ dbt_utils.generate_surrogate_key(['shipdate']) }} as ship_date_key, 10 | {{ dbt_utils.generate_surrogate_key(['duedate']) }} as due_date_key, 11 | {{ dbt_utils.generate_surrogate_key(['territoryid']) }} as territory_key, 12 | 13 | orderdate, 14 | onlineorderflag, 15 | stg_salesorderdetail.unitpricediscount as unitpricediscount, 16 | stg_salesorderheader.salesordernumber, 17 | stg_salesorderdetail.salesorderid, 18 | stg_salesorderdetail.salesorderdetailid, 19 | stg_salesorderdetail.unitprice, 20 | stg_salesorderdetail.orderqty, 21 | stg_salesorderdetail.linetotal as revenue, 22 | stg_salesorderdetail.linetotal as salesamount, 23 | case when stg_salesorderdetail.unitpricediscount > 0 24 | then stg_salesorderdetail.linetotal * stg_salesorderdetail.unitpricediscount 25 | else stg_salesorderdetail.linetotal 26 | end as totaldiscount, 27 | stg_salesorderheader.taxamt 28 | from {{ ref('stg_salesorderdetail') }} 29 | inner join {{ ref('stg_salesorderheader') }} on stg_salesorderdetail.salesorderid = stg_salesorderheader.salesorderid -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/mart/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: address 5 | description: contains address information for the customers. 6 | 7 | - name: customer 8 | description: contains customer details for the company.
9 | 10 | - name: orderstatus 11 | description: contains order status details. 12 | 13 | - name: product 14 | description: contains product information and product hierarchy. 15 | 16 | - name: territory 17 | description: contains sales territory details. 18 | 19 | - name: date 20 | description: date dimension for date and time analysis. 21 | 22 | - name: sales 23 | description: contains sales transactions joined to the dimensions included in this data model. -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/mart/territory.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ dbt_utils.generate_surrogate_key(['stg_salesterritory.territoryid']) }} as territory_key, 3 | territoryid, 4 | salesterritoryname as territoryname, 5 | "Group" as territory_group, 6 | countryregioncode, 7 | costytd, 8 | salesytd, 9 | costlastyear, 10 | saleslastyear, 11 | modifieddate 12 | from {{ ref('stg_salesterritory') }} -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | models: 3 | - name: stg_address 4 | description: This model represents the staging address table for the address dimension. 5 | It serves as an intermediate table for loading and transforming address 6 | data from the AdventureWorks source database. The table includes columns 7 | such as city, rowguid, addressid, postalcode, addressline1, addressline2, 8 | modifieddate, spatiallocation, and stateprovinceid. This data can be 9 | further processed and loaded into the final dim address table for analysis 10 | and reporting purposes. 11 | columns: 12 | - name: city 13 | description: The name of the city where the address is located. This column 14 | provides information about the geographical location of the address 15 | and can be used for various analytical scenarios. For example, it can 16 | be used to analyze customer distribution across different cities, 17 | identify popular cities for business operations, or understand 18 | regional trends in sales or customer behavior.
19 | - name: rowguid 20 | description: "" 21 | - name: addressid 22 | data_tests: 23 | - unique 24 | - not_null 25 | description: "" 26 | - name: postalcode 27 | description: Zip Code 28 | - name: addressline1 29 | description: Address First Line 30 | - name: addressline2 31 | description: Address Second Line 32 | - name: modifieddate 33 | description: Address modified date 34 | - name: spatiallocation 35 | description: "" 36 | - name: stateprovinceid 37 | description: Foreign key for stateprovince 38 | data_tests: 39 | - relationships: 40 | to: ref('stg_stateprovince') 41 | field: stateprovinceid 42 | 43 | 44 | 45 | - name: stg_customer 46 | columns: 47 | - name: storeid 48 | description: "" 49 | - name: rowguid 50 | description: "" 51 | - name: personid 52 | description: "" 53 | - name: customerid 54 | description: The primary key for this table 55 | data_tests: 56 | - unique 57 | - not_null 58 | - name: territoryid 59 | description: "" 60 | - name: modifieddate 61 | description: "" 62 | - name: accountnumber 63 | description: "" 64 | 65 | - name: stg_salesorderdetail 66 | columns: 67 | - name: orderqty 68 | description: " " 69 | - name: linetotal 70 | description: " " 71 | - name: productid 72 | description: " " 73 | - name: unitprice 74 | description: " " 75 | - name: modifieddate 76 | description: " " 77 | - name: salesorderid 78 | description: " " 79 | data_tests: 80 | - not_null 81 | - name: specialofferid 82 | description: " " 83 | - name: unitpricediscount 84 | description: " " 85 | - name: salesorderdetailid 86 | description: The column is part of the primary key for this table 87 | data_tests: 88 | - not_null 89 | - name: carriertrackingnumber 90 | description: " " 91 | - name: stg_salesorderheader 92 | columns: 93 | - name: status 94 | description: " " 95 | - name: taxamt 96 | description: " " 97 | - name: comment 98 | description: " " 99 | - name: duedate 100 | description: " " 101 | - name: freight 102 | description: " " 103 | - name: rowguid 104 | description: " " 105 | - name: shipdate 106 | description: " " 107 | - name: subtotal 108 | description: " " 109 | - name: totaldue 110 | description: " " 111 | - name: orderdate 112 | description: " " 113 | - name: customerid 114 | description: " " 115 | - name: territoryid 116 | description: " " 117 | - name: creditcardid 118 | description: " " 119 | - name: modifieddate 120 | description: " " 121 | - name: salesorderid 122 | description: The primary key for this table 123 | data_tests: 124 | - unique 125 | - not_null 126 | - name: shipmethodid 127 | description: " " 128 | - name: salespersonid 129 | description: " " 130 | - name: currencyrateid 131 | description: " " 132 | - name: revisionnumber 133 | description: " " 134 | - name: billtoaddressid 135 | description: " " 136 | - name: shiptoaddressid 137 | description: " " 138 | - name: salesordernumber 139 | description: " " 140 | - name: creditcardapprovalcode 141 | description: " " 142 | - name: stg_countryregion 143 | columns: 144 | - name: modifieddate 145 | description: " " 146 | - name: countryregioncode 147 | description: The primary key for this table 148 | data_tests: 149 | - unique 150 | - not_null 151 | - name: countryregionname 152 | description: " " 153 | - name: stg_product 154 | columns: 155 | - name: size 156 | description: " " 157 | - name: class 158 | description: " " 159 | - name: color 160 | description: " " 161 | - name: style 162 | description: " " 163 | - name: weight 164 | description: " " 165 | - name: rowguid 166 | description: " " 167 | - name: 
listprice 168 | description: " " 169 | - name: productid 170 | description: The primary key for this table 171 | data_tests: 172 | - unique 173 | - not_null 174 | - name: productline 175 | description: " " 176 | - name: productname 177 | description: " " 178 | - name: sellenddate 179 | description: " " 180 | - name: modifieddate 181 | description: " " 182 | - name: reorderpoint 183 | description: " " 184 | - name: standardcost 185 | description: " " 186 | - name: productnumber 187 | description: " " 188 | - name: sellstartdate 189 | description: " " 190 | - name: productmodelid 191 | description: " " 192 | - name: discontinueddate 193 | description: " " 194 | - name: safetystocklevel 195 | description: " " 196 | - name: daystomanufacture 197 | description: " " 198 | - name: sizeunitmeasurecode 199 | description: " " 200 | - name: productsubcategoryid 201 | description: " " 202 | - name: weightunitmeasurecode 203 | description: " " 204 | - name: stg_person 205 | columns: 206 | - name: title 207 | description: " " 208 | - name: suffix 209 | description: " " 210 | - name: rowguid 211 | description: " " 212 | - name: lastname 213 | description: " " 214 | - name: firstname 215 | description: " " 216 | - name: middlename 217 | description: " " 218 | - name: persontype 219 | description: " " 220 | - name: demographics 221 | description: " " 222 | - name: modifieddate 223 | description: " " 224 | - name: emailpromotion 225 | description: " " 226 | - name: businessentityid 227 | description: The primary key for this table 228 | data_tests: 229 | - unique 230 | - not_null 231 | - name: additionalcontactinfo 232 | description: " " 233 | - name: stg_salesterritory 234 | columns: 235 | - name: group 236 | description: " " 237 | - name: costytd 238 | description: " " 239 | - name: rowguid 240 | description: " " 241 | - name: salesytd 242 | description: " " 243 | - name: territoryid 244 | description: The primary key for this table 245 | data_tests: 246 | - unique 247 | - not_null 248 | - name: costlastyear 249 | description: " " 250 | - name: modifieddate 251 | description: " " 252 | - name: saleslastyear 253 | description: " " 254 | - name: countryregioncode 255 | description: " " 256 | - name: salesterritoryname 257 | description: " " 258 | - name: stg_productsubcategory 259 | columns: 260 | - name: rowguid 261 | description: " " 262 | - name: modifieddate 263 | description: " " 264 | - name: productcategoryid 265 | description: "" 266 | - name: productsubcategory 267 | description: " " 268 | - name: productsubcategoryid 269 | description: The primary key for this table 270 | data_tests: 271 | - unique 272 | - not_null 273 | - name: stg_stateprovince 274 | columns: 275 | - name: rowguid 276 | description: " " 277 | - name: territoryid 278 | description: " " 279 | - name: modifieddate 280 | description: " " 281 | - name: stateprovinceid 282 | description: The primary key for this table 283 | data_tests: 284 | - unique 285 | - not_null 286 | - name: statprovincename 287 | description: " " 288 | - name: countryregioncode 289 | description: " " 290 | - name: stateprovincecode 291 | description: " " 292 | - name: stg_productcategory 293 | description: This table stores product category data 294 | columns: 295 | - name: productcategoryid 296 | description: The primary key for this table 297 | data_tests: 298 | - unique 299 | - not_null 300 | - name: productcategory 301 | description: Productcategory label 302 | data_tests: 303 | - accepted_values: 304 | values: 305 | - Bikes 306 | - Components 307 | - 
Clothing 308 | - Accessories 309 | -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/source.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: adventureworks 5 | schema: source 6 | database: adventureworks 7 | 8 | tables: 9 | - name: vw_address 10 | identifier: vw_address 11 | 12 | - name: Customer 13 | identifier: customer 14 | - name: SalesOrderDetail 15 | identifier: salesorderdetail 16 | - name: salesorderheader 17 | - name: vw_countryregion 18 | - name: vw_person 19 | - name: vw_product 20 | - name: vw_productcategory 21 | - name: vw_productsubcategory 22 | - name: vw_salesterritory 23 | - name: vw_stateprovince 24 | - name: vw_salesorderheader 25 | - name: BusinessEntityAddress 26 | identifier: businessentityaddress 27 | - name: src_customer 28 | - name: proudct -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_address.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | select * from {{ source('adventureworks', 'vw_address') }} 3 | ), 4 | 5 | renamed as ( 6 | select 7 | city, 8 | rowguid, 9 | addressid, 10 | postalcode, 11 | addressline1, 12 | addressline2, 13 | modifieddate, 14 | spatiallocation, 15 | stateprovinceid 16 | from source 17 | ) 18 | 19 | select * from renamed 20 | -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_country.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('adventureworks', 'vw_countryregion') }} 4 | ), 5 | 6 | isocountry as ( 7 | 8 | select * from {{ ref('country_codes') }} 9 | ), 10 | renamed as ( 11 | 12 | select 13 | modifieddate, 14 | countryregioncode, 15 | countryregionname 16 | from source 17 | ), 18 | isorename as ( 19 | 20 | select 21 | "Alpha-2 code" as isocode2, 22 | "Alpha-3 code" as isocode3, 23 | "Numeric" as uncode 24 | from isocountry 25 | ) 26 | 27 | select 28 | modifieddate, 29 | countryregioncode, 30 | countryregionname, 31 | isocode3, 32 | uncode 33 | from renamed 34 | left join isorename on isorename.isocode2 = renamed.countryregioncode -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_countryregion.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('adventureworks', 'vw_countryregion') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | modifieddate, 9 | countryregioncode, 10 | countryregionname 11 | from source 12 | ) 13 | 14 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_customer.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('adventureworks', 'Customer') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | storeid, 9 | rowguid, 10 | personid, 11 | customerid, 12 | territoryid, 13 | modifieddate, 14 | accountnumber 15 | from source 16 | ) 17 | 18 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_date.sql: 
-------------------------------------------------------------------------------- 1 | with date_dim as ( 2 | 3 | 4 | {{ dbt_date.get_date_dimension("2011-01-01", "2014-12-31") }} 5 | 6 | ) 7 | select 8 | * 9 | from date_dim -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_entityaddress.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('adventureworks', 'BusinessEntityAddress') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | addresstypeid, 9 | businessentityid, 10 | modifieddate, 11 | addressid 12 | from source 13 | ) 14 | 15 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_person.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('adventureworks', 'vw_person') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | title, 9 | suffix, 10 | rowguid, 11 | lastname, 12 | firstname, 13 | middlename, 14 | persontype, 15 | modifieddate, 16 | emailpromotion, 17 | businessentityid, 18 | houseownerflag, 19 | occupation, 20 | gender, 21 | maritalstatus, 22 | commutedistance, 23 | education, 24 | numbercarsowned, 25 | totalchildren, 26 | birthdate, 27 | datefirstpurchase 28 | from source 29 | ) 30 | 31 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_product.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('adventureworks', 'vw_product') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | "Size", 9 | "Class", 10 | color, 11 | "Style", 12 | weight, 13 | rowguid, 14 | listprice, 15 | productid, 16 | productline, 17 | productname, 18 | sellenddate AS sellenddate, 19 | modifieddate, 20 | reorderpoint, 21 | standardcost, 22 | productnumber, 23 | sellstartdate AS sellstartdate, 24 | productmodelid, 25 | discontinueddate, 26 | safetystocklevel, 27 | daystomanufacture, 28 | sizeunitmeasurecode, 29 | productsubcategoryid 30 | from source 31 | ) 32 | 33 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_productcategory.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('adventureworks', 'vw_productcategory') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | rowguid, 9 | modifieddate, 10 | productcategory, 11 | productcategoryid 12 | from source 13 | ) 14 | 15 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_productsubcategory.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('adventureworks', 'vw_productsubcategory') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | rowguid, 9 | modifieddate, 10 | productcategoryid, 11 | productsubcategory, 12 | productsubcategoryid 13 | from source 14 | ) 15 | 16 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_salesorderdetail.sql: -------------------------------------------------------------------------------- 1 | 
with source as ( 2 | 3 | select * from {{ source('adventureworks', 'SalesOrderDetail') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | rowguid, 9 | orderqty, 10 | linetotal, 11 | productid, 12 | unitprice, 13 | modifieddate, 14 | salesorderid, 15 | specialofferid, 16 | unitpricediscount, 17 | salesorderdetailid, 18 | carriertrackingnumber 19 | from source 20 | ) 21 | 22 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_salesorderheader.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('adventureworks', 'vw_salesorderheader') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | status, 9 | onlineorderflag, 10 | taxamt, 11 | purchaseordernumber, 12 | "Comment", --First letter is uppercase therefore in double quoutes 13 | duedate::timestamp AS duedate, 14 | freight, 15 | rowguid, 16 | shipdate::timestamp AS shipdate, 17 | subtotal, 18 | totaldue, 19 | orderdate::timestamp AS orderdate, 20 | customerid, 21 | territoryid, 22 | creditcardid, 23 | modifieddate, 24 | salesorderid, 25 | shipmethodid, 26 | salespersonid, 27 | currencyrateid, 28 | revisionnumber, 29 | billtoaddressid, 30 | shiptoaddressid, 31 | salesordernumber, 32 | creditcardapprovalcode 33 | from source 34 | ) 35 | 36 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_salesterritory.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('adventureworks', 'vw_salesterritory') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | "Group", 9 | costytd, 10 | rowguid, 11 | salesytd, 12 | territoryid, 13 | costlastyear, 14 | modifieddate, 15 | saleslastyear, 16 | countryregioncode, 17 | salesterritoryname 18 | from source 19 | ) 20 | 21 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/models/staging/stg_stateprovince.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('adventureworks', 'vw_stateprovince') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | rowguid, 9 | territoryid, 10 | modifieddate, 11 | stateprovinceid, 12 | statprovincename, 13 | countryregioncode, 14 | stateprovincecode 15 | from source 16 | ) 17 | 18 | select * from renamed -------------------------------------------------------------------------------- /airflow-sql/warehouse/package-lock.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | - package: dbt-labs/codegen 5 | version: 0.9.0 6 | - package: calogica/dbt_date 7 | version: 0.10.0 8 | sha1_hash: ca45e02a7fef319df0fe35d53802371ea0fbf08a 9 | -------------------------------------------------------------------------------- /airflow-sql/warehouse/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | 5 | - package: dbt-labs/codegen 6 | version: 0.9.0 7 | 8 | - package: calogica/dbt_date 9 | version: 0.10.0 -------------------------------------------------------------------------------- /airflow-sql/warehouse/profiles.yml: -------------------------------------------------------------------------------- 
1 | warehouse: 2 | outputs: 3 | 4 | dev: 5 | type: postgres 6 | threads: 4 7 | host: 192.168.1.39 8 | port: 5432 9 | user: "{{ env_var('PGUID') }}" 10 | pass: "{{ env_var('PGPASS') }}" 11 | dbname: adventureworks 12 | schema: public 13 | 14 | target: dev 15 | -------------------------------------------------------------------------------- /airflow-sql/warehouse/seeds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/airflow-sql/warehouse/seeds/.gitkeep -------------------------------------------------------------------------------- /airflow-sql/warehouse/seeds/country_codes.csv: -------------------------------------------------------------------------------- 1 | English short name,Alpha-2 code,Alpha-3 code,Numeric 2 | Afghanistan,AF,AFG,4 3 | Albania,AL,ALB,8 4 | Algeria,DZ,DZA,12 5 | American Samoa,AS,ASM,16 6 | Andorra,AD,AND,20 7 | Angola,AO,AGO,24 8 | Anguilla,AI,AIA,660 9 | Antarctica,AQ,ATA,10 10 | Antigua and Barbuda,AG,ATG,28 11 | Argentina,AR,ARG,32 12 | Armenia,AM,ARM,51 13 | Aruba,AW,ABW,533 14 | Australia,AU,AUS,36 15 | Austria,AT,AUT,40 16 | Azerbaijan,AZ,AZE,31 17 | Bahamas (the),BS,BHS,44 18 | Bahrain,BH,BHR,48 19 | Bangladesh,BD,BGD,50 20 | Barbados,BB,BRB,52 21 | Belarus,BY,BLR,112 22 | Belgium,BE,BEL,56 23 | Belize,BZ,BLZ,84 24 | Benin,BJ,BEN,204 25 | Bermuda,BM,BMU,60 26 | Bhutan,BT,BTN,64 27 | Bolivia (Plurinational State of),BO,BOL,68 28 | "Bonaire, Sint Eustatius and Saba",BQ,BES,535 29 | Bosnia and Herzegovina,BA,BIH,70 30 | Botswana,BW,BWA,72 31 | Bouvet Island,BV,BVT,74 32 | Brazil,BR,BRA,76 33 | British Indian Ocean Territory (the),IO,IOT,86 34 | Brunei Darussalam,BN,BRN,96 35 | Bulgaria,BG,BGR,100 36 | Burkina Faso,BF,BFA,854 37 | Burundi,BI,BDI,108 38 | Cabo Verde,CV,CPV,132 39 | Cambodia,KH,KHM,116 40 | Cameroon,CM,CMR,120 41 | Canada,CA,CAN,124 42 | Cayman Islands (the),KY,CYM,136 43 | Central African Republic (the),CF,CAF,140 44 | Chad,TD,TCD,148 45 | Chile,CL,CHL,152 46 | China,CN,CHN,156 47 | Christmas Island,CX,CXR,162 48 | Cocos (Keeling) Islands (the),CC,CCK,166 49 | Colombia,CO,COL,170 50 | Comoros (the),KM,COM,174 51 | Congo (the Democratic Republic of the),CD,COD,180 52 | Congo (the),CG,COG,178 53 | Cook Islands (the),CK,COK,184 54 | Costa Rica,CR,CRI,188 55 | Croatia,HR,HRV,191 56 | Cuba,CU,CUB,192 57 | Cyprus,CY,CYP,196 58 | Czechia,CZ,CZE,203 59 | Denmark,DK,DNK,208 60 | Djibouti,DJ,DJI,262 61 | Dominica,DM,DMA,212 62 | Dominican Republic (the),DO,DOM,214 63 | Ecuador,EC,ECU,218 64 | Egypt,EG,EGY,818 65 | El Salvador,SV,SLV,222 66 | Equatorial Guinea,GQ,GNQ,226 67 | Eritrea,ER,ERI,232 68 | Estonia,EE,EST,233 69 | Eswatini,SZ,SWZ,748 70 | Ethiopia,ET,ETH,231 71 | Falkland Islands (the) [Malvinas],FK,FLK,238 72 | Faroe Islands (the),FO,FRO,234 73 | Fiji,FJ,FJI,242 74 | Finland,FI,FIN,246 75 | France,FR,FRA,250 76 | French Guiana,GF,GUF,254 77 | French Polynesia,PF,PYF,258 78 | French Southern Territories (the),TF,ATF,260 79 | Gabon,GA,GAB,266 80 | Gambia (the),GM,GMB,270 81 | Georgia,GE,GEO,268 82 | Germany,DE,DEU,276 83 | Ghana,GH,GHA,288 84 | Gibraltar,GI,GIB,292 85 | Greece,GR,GRC,300 86 | Greenland,GL,GRL,304 87 | Grenada,GD,GRD,308 88 | Guadeloupe,GP,GLP,312 89 | Guam,GU,GUM,316 90 | Guatemala,GT,GTM,320 91 | Guernsey,GG,GGY,831 92 | Guinea,GN,GIN,324 93 | Guinea-Bissau,GW,GNB,624 94 | Guyana,GY,GUY,328 95 | Haiti,HT,HTI,332 96 | Heard Island and McDonald Islands,HM,HMD,334 97 | Holy See (the),VA,VAT,336 98 | 
Honduras,HN,HND,340 99 | Hong Kong,HK,HKG,344 100 | Hungary,HU,HUN,348 101 | Iceland,IS,ISL,352 102 | India,IN,IND,356 103 | Indonesia,ID,IDN,360 104 | Iran (Islamic Republic of),IR,IRN,364 105 | Iraq,IQ,IRQ,368 106 | Ireland,IE,IRL,372 107 | Isle of Man,IM,IMN,833 108 | Israel,IL,ISR,376 109 | Italy,IT,ITA,380 110 | Jamaica,JM,JAM,388 111 | Japan,JP,JPN,392 112 | Jersey,JE,JEY,832 113 | Jordan,JO,JOR,400 114 | Kazakhstan,KZ,KAZ,398 115 | Kenya,KE,KEN,404 116 | Kiribati,KI,KIR,296 117 | Korea (the Democratic People's Republic of),KP,PRK,408 118 | Korea (the Republic of),KR,KOR,410 119 | Kuwait,KW,KWT,414 120 | Kyrgyzstan,KG,KGZ,417 121 | Lao People's Democratic Republic (the),LA,LAO,418 122 | Latvia,LV,LVA,428 123 | Lebanon,LB,LBN,422 124 | Lesotho,LS,LSO,426 125 | Liberia,LR,LBR,430 126 | Libya,LY,LBY,434 127 | Liechtenstein,LI,LIE,438 128 | Lithuania,LT,LTU,440 129 | Luxembourg,LU,LUX,442 130 | Macao,MO,MAC,446 131 | Madagascar,MG,MDG,450 132 | Malawi,MW,MWI,454 133 | Malaysia,MY,MYS,458 134 | Maldives,MV,MDV,462 135 | Mali,ML,MLI,466 136 | Malta,MT,MLT,470 137 | Marshall Islands (the),MH,MHL,584 138 | Martinique,MQ,MTQ,474 139 | Mauritania,MR,MRT,478 140 | Mauritius,MU,MUS,480 141 | Mayotte,YT,MYT,175 142 | Mexico,MX,MEX,484 143 | Micronesia (Federated States of),FM,FSM,583 144 | Moldova (the Republic of),MD,MDA,498 145 | Monaco,MC,MCO,492 146 | Mongolia,MN,MNG,496 147 | Montenegro,ME,MNE,499 148 | Montserrat,MS,MSR,500 149 | Morocco,MA,MAR,504 150 | Mozambique,MZ,MOZ,508 151 | Myanmar,MM,MMR,104 152 | Namibia,NA,NAM,516 153 | Nauru,NR,NRU,520 154 | Nepal,NP,NPL,524 155 | Netherlands (Kingdom of the),NL,NLD,528 156 | New Caledonia,NC,NCL,540 157 | New Zealand,NZ,NZL,554 158 | Nicaragua,NI,NIC,558 159 | Niger (the),NE,NER,562 160 | Nigeria,NG,NGA,566 161 | Niue,NU,NIU,570 162 | Norfolk Island,NF,NFK,574 163 | North Macedonia,MK,MKD,807 164 | Northern Mariana Islands (the),MP,MNP,580 165 | Norway,NO,NOR,578 166 | Oman,OM,OMN,512 167 | Pakistan,PK,PAK,586 168 | Palau,PW,PLW,585 169 | "Palestine, State of",PS,PSE,275 170 | Panama,PA,PAN,591 171 | Papua New Guinea,PG,PNG,598 172 | Paraguay,PY,PRY,600 173 | Peru,PE,PER,604 174 | Philippines (the),PH,PHL,608 175 | Pitcairn,PN,PCN,612 176 | Poland,PL,POL,616 177 | Portugal,PT,PRT,620 178 | Puerto Rico,PR,PRI,630 179 | Qatar,QA,QAT,634 180 | Romania,RO,ROU,642 181 | Russian Federation (the),RU,RUS,643 182 | Rwanda,RW,RWA,646 183 | "Saint Helena, Ascension and Tristan da Cunha",SH,SHN,654 184 | Saint Kitts and Nevis,KN,KNA,659 185 | Saint Lucia,LC,LCA,662 186 | Saint Martin (French part),MF,MAF,663 187 | Saint Pierre and Miquelon,PM,SPM,666 188 | Saint Vincent and the Grenadines,VC,VCT,670 189 | Samoa,WS,WSM,882 190 | San Marino,SM,SMR,674 191 | Sao Tome and Principe,ST,STP,678 192 | Saudi Arabia,SA,SAU,682 193 | Senegal,SN,SEN,686 194 | Serbia,RS,SRB,688 195 | Seychelles,SC,SYC,690 196 | Sierra Leone,SL,SLE,694 197 | Singapore,SG,SGP,702 198 | Sint Maarten (Dutch part),SX,SXM,534 199 | Slovakia,SK,SVK,703 200 | Slovenia,SI,SVN,705 201 | Solomon Islands,SB,SLB,90 202 | Somalia,SO,SOM,706 203 | South Africa,ZA,ZAF,710 204 | South Georgia and the South Sandwich Islands,GS,SGS,239 205 | South Sudan,SS,SSD,728 206 | Spain,ES,ESP,724 207 | Sri Lanka,LK,LKA,144 208 | Sudan (the),SD,SDN,729 209 | Suriname,SR,SUR,740 210 | Svalbard and Jan Mayen,SJ,SJM,744 211 | Sweden,SE,SWE,752 212 | Switzerland,CH,CHE,756 213 | Syrian Arab Republic (the),SY,SYR,760 214 | Taiwan (Province of China),TW,TWN,158 215 | Tajikistan,TJ,TJK,762 216 | "Tanzania, the United 
Republic of",TZ,TZA,834 217 | Thailand,TH,THA,764 218 | Timor-Leste,TL,TLS,626 219 | Togo,TG,TGO,768 220 | Tokelau,TK,TKL,772 221 | Tonga,TO,TON,776 222 | Trinidad and Tobago,TT,TTO,780 223 | Tunisia,TN,TUN,788 224 | Turkmenistan,TM,TKM,795 225 | Turks and Caicos Islands (the),TC,TCA,796 226 | Tuvalu,TV,TUV,798 227 | Uganda,UG,UGA,800 228 | Ukraine,UA,UKR,804 229 | United Arab Emirates (the),AE,ARE,784 230 | United Kingdom of Great Britain and Northern Ireland (the),GB,GBR,826 231 | United States Minor Outlying Islands (the),UM,UMI,581 232 | United States of America (the),US,USA,840 233 | Uruguay,UY,URY,858 234 | Uzbekistan,UZ,UZB,860 235 | Vanuatu,VU,VUT,548 236 | Venezuela (Bolivarian Republic of),VE,VEN,862 237 | Viet Nam,VN,VNM,704 238 | Virgin Islands (British),VG,VGB,92 239 | Virgin Islands (U.S.),VI,VIR,850 240 | Wallis and Futuna,WF,WLF,876 241 | Western Sahara*,EH,ESH,732 242 | Yemen,YE,YEM,887 243 | Zambia,ZM,ZMB,894 244 | Zimbabwe,ZW,ZWE,716 245 | 246 | -------------------------------------------------------------------------------- /airflow-sql/warehouse/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/airflow-sql/warehouse/snapshots/.gitkeep -------------------------------------------------------------------------------- /airflow-sql/warehouse/snapshots/product_snapshot.sql: -------------------------------------------------------------------------------- 1 | {% snapshot product_snapshot %} 2 | 3 | {{ 4 | config( 5 | target_schema='snapshots', 6 | unique_key='product_id', 7 | strategy='timestamp', 8 | updated_at='modified_date', 9 | ) 10 | }} 11 | 12 | select * from {{ source('adventureworks', 'product') }} 13 | 14 | {% endsnapshot %} -------------------------------------------------------------------------------- /airflow-sql/warehouse/snapshots/source.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: adventureworks 5 | schema: staging 6 | database: adventureworks 7 | 8 | tables: 9 | - name: product -------------------------------------------------------------------------------- /airflow-sql/warehouse/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/airflow-sql/warehouse/tests/.gitkeep -------------------------------------------------------------------------------- /dbtDAGs.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/dbtDAGs.png -------------------------------------------------------------------------------- /sql/dimCustomer.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ dbt_utils.generate_surrogate_key(['stg_customer.customerid']) }} as customer_key, 3 | stg_customer.customerid, 4 | stg_person.businessentityid as personbusinessentityid, 5 | stg_person.title, 6 | stg_person.firstname || ' '|| lastname as fullname, 7 | stg_person.houseownerflag, 8 | stg_person.occupation, 9 | stg_person.maritalstatus, 10 | stg_person.commutedistance, 11 | stg_person.education, 12 | stg_person.numbercarsowned, 13 | stg_person.totalchildren, 14 | stg_person.birthdate, 15 | stg_person.datefirstpurchase, 16 | stg_countryregion.countryregionname as 
country, 17 | stg_address.city, 18 | stg_stateprovince.statprovincename as state, 19 | stg_address.postalcode, 20 | stg_address.addressline1, 21 | stg_address.addressline2 22 | from {{ ref('stg_customer') }} 23 | left join {{ ref('stg_person') }} on stg_customer.personid = stg_person.businessentityid 24 | left join {{ ref('stg_entityaddress') }} on stg_entityaddress.businessentityid = stg_person.businessentityid 25 | left join {{ ref('stg_address') }} on stg_address.addressid = stg_entityaddress.addressid 26 | left join {{ ref('stg_stateprovince') }} on stg_stateprovince.stateprovinceid = stg_address.stateprovinceid 27 | left join {{ ref('stg_countryregion') }} on stg_countryregion.countryregioncode = stg_stateprovince.countryregioncode 28 | where persontype = 'IN' 29 | -------------------------------------------------------------------------------- /sql/dimDate.sql: -------------------------------------------------------------------------------- 1 | with date_dimension as ( 2 | select * from {{ ref('stg_date') }} 3 | ), 4 | full_dt as ( 5 | {{ dbt_date.get_base_dates(start_date="2011-01-01", end_date="2014-12-31") }} 6 | ), 7 | full_dt_tr as ( 8 | select 9 | d.*, 10 | f.date_day as fulldt, 11 | {{ dbt_date.convert_timezone("f.date_day", "America/New_York", "UTC") }} as dulldtz, 12 | {{ dbt_date.convert_timezone("f.date_day", "America/New_York", source_tz="UTC") }} as dulldtzt, 13 | {{ dbt_date.convert_timezone("f.date_day", "America/New_York") }} as test, 14 | --f.date_day AT TIME ZONE 'PST' AS "direct_pst", 15 | f.date_day::timestamp "direct_dts", 16 | f.date_day::timestamp AT TIME ZONE 'UTC' AS "ts_utc" 17 | from 18 | date_dimension d 19 | left join full_dt f on d.date_day = cast(f.date_day as date) 20 | ) 21 | select 22 | {{ dbt_utils.generate_surrogate_key(['direct_dts']) }} as date_key, 23 | * 24 | From full_dt_tr 25 | -------------------------------------------------------------------------------- /sql/readme.md: -------------------------------------------------------------------------------- 1 | # SQL Server tables and views used as the source for data ingestion 2 | Below is the complete list of tables and views used as our source for the dbt project. The SQL scripts for the views are available in this folder. 3 | 4 | We use a transactional database, SQL Server, [AdventureWorks2019](https://learn.microsoft.com/en-us/sql/samples/adventureworks-install-configure?view=sql-server-ver16&tabs=ssms) as our source. We extract and load data with an EL tool [Airbyte](https://www.youtube.com/watch?v=2FvMa7vaxDY&t).
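Once Airbyte lands these tables and views in the `source` schema of the Postgres warehouse, dbt picks them up through the `src_postgres` source declared in `warehouse/models/staging/source.yml`. As an illustration, here is a condensed copy of the project's `stg_countryregion` staging model (the full set of staging models lives under `warehouse/models/staging/`):

```sql
-- Condensed from warehouse/models/staging/stg_countryregion.sql
with source as (

    -- 'src_postgres' points at the schema/database that Airbyte loads the SQL Server objects into
    select * from {{ source('src_postgres', 'vw_countryregion') }}

),

renamed as (

    select
        modifieddate,
        countryregioncode,
        countryregionname
    from source

)

select * from renamed
```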
5 | 6 | ## Source tables/views used from AdventureWorks datatabase 7 | | schemaname | tablename | type | 8 | |------------ |----------------------------- |------- | 9 | | source | address | table | 10 | | source | customer | table | 11 | | source | salesorderdetail | table | 12 | | source | salesorderheadersalesreason | table | 13 | | source | salesorderheader | table | 14 | | source | vw_countryregion | view | 15 | | source | vw_product | view | 16 | | source | vw_productcategory | view | 17 | | source | vw_person | view | 18 | | source | vw_store | view | 19 | | source | vw_salesreason | view | 20 | | source | vw_salesterritory | view | 21 | | source | vw_productsubcategory | view | 22 | | source | vw_stateprovince | view | 23 | -------------------------------------------------------------------------------- /sql/stg_entityaddress.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'businessentityaddress') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | addresstypeid, 9 | businessentityid, 10 | modifieddate, 11 | addressid 12 | from source 13 | ) 14 | 15 | select * from renamed 16 | -------------------------------------------------------------------------------- /sql/stg_salesorderheader.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'salesorderheader') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | status, 9 | onlineorderflag, 10 | taxamt, 11 | purchaseordernumber, 12 | "Comment", --First letter is uppercase therefore in double quoutes 13 | duedate::timestamp AS duedate, 14 | freight, 15 | rowguid, 16 | shipdate::timestamp AS shipdate, 17 | subtotal, 18 | totaldue, 19 | orderdate::timestamp AS orderdate, 20 | customerid, 21 | territoryid, 22 | creditcardid, 23 | modifieddate, 24 | salesorderid, 25 | shipmethodid, 26 | salespersonid, 27 | currencyrateid, 28 | revisionnumber, 29 | billtoaddressid, 30 | shiptoaddressid, 31 | salesordernumber, 32 | creditcardapprovalcode 33 | from source 34 | ) 35 | 36 | select * from renamed 37 | -------------------------------------------------------------------------------- /sql/vw_countryregion.sql: -------------------------------------------------------------------------------- 1 | USE [AdventureWorks2019] 2 | GO 3 | 4 | /****** Object: View [dbo].[vw_countryregion] Script Date: 6/3/2023 10:41:07 AM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | /****** Script for SelectTopNRows command from SSMS ******/ 12 | 13 | Create View [dbo].[vw_countryregion] 14 | AS 15 | SELECT [CountryRegionCode] 16 | ,cast([Name] as varchar(50)) AS CountryRegionName 17 | ,[ModifiedDate] 18 | FROM [AdventureWorks2019].[Person].[CountryRegion] 19 | GO 20 | 21 | 22 | -------------------------------------------------------------------------------- /sql/vw_person.sql: -------------------------------------------------------------------------------- 1 | USE [AdventureWorks2019] 2 | GO 3 | 4 | /****** Object: View [dbo].[vw_person] Script Date: 7/25/2023 4:13:05 PM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | 12 | /****** Script for SelectTopNRows command from SSMS ******/ 13 | 14 | 15 | ALTER View [dbo].[vw_person] 16 | AS 17 | 18 | SELECT [BusinessEntityID] 19 | ,[PersonType] 20 | ,[NameStyle] 21 | ,[Title] 22 | ,cast([FirstName] as varchar(50)) AS FirstName 23 | ,cast([MiddleName] as 
varchar(50)) AS MiddleName 24 | ,cast([LastName] as varchar(50)) AS LastName 25 | ,[Suffix] 26 | ,[EmailPromotion] 27 | ,[rowguid] 28 | ,[ModifiedDate] 29 | , cast(CONVERT(datetime, LEFT(Survey.ref.value(N'declare default element namespace "http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey";BirthDate','varchar(20)'), 10)) as varchar(20)) AS [BirthDate], 30 | cast(Survey.ref.value(N'declare default element namespace "http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey";MaritalStatus','varchar(1)') as varchar(1)) AS [MaritalStatus], 31 | cast(Survey.ref.value(N'declare default element namespace "http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey";Gender','varchar(1)') AS varchar(1)) AS [Gender], 32 | cast(Survey.ref.value(N'declare default element namespace "http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey";TotalChildren','int') AS int) AS [TotalChildren], 33 | cast(Survey.ref.value(N'declare default element namespace "http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey";NumberChildrenAtHome','int') AS int) AS [NumberChildrenAtHome], 34 | CAST(Survey.ref.value(N'declare default element namespace "http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey";HomeOwnerFlag','int') AS varchar(1)) AS [HouseOwnerFlag], 35 | cast(Survey.ref.value(N'declare default element namespace "http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey";NumberCarsOwned','int') AS INT) AS [NumberCarsOwned], 36 | CONVERT(datetime, LEFT(Survey.ref.value(N'declare default element namespace "http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey";DateFirstPurchase','varchar(20)'), 10)) AS [DateFirstPurchase], 37 | cast(Survey.ref.value(N'declare default element namespace "http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey";CommuteDistance','varchar(15)') AS varchar(15)) AS [CommuteDistance], 38 | cast(Person.Demographics.value(N'declare default element namespace "http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey";(IndividualSurvey/Education)[1]','varchar(40)') AS varchar(40)) as [Education], 39 | cast(Person.Demographics.value(N'declare default element namespace "http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey";(IndividualSurvey/Occupation)[1]','varchar(40)') AS varchar(40)) as [Occupation] 40 | FROM [AdventureWorks2019].[Person].[Person] 41 | cross apply Person.[Demographics].nodes(N'declare default element namespace "http://schemas.microsoft.com/sqlserver/2004/07/adventure-works/IndividualSurvey";IndividualSurvey') AS Survey(ref) 42 | GO 43 | 44 | 45 | -------------------------------------------------------------------------------- /sql/vw_product.sql: -------------------------------------------------------------------------------- 1 | USE [AdventureWorks2019] 2 | GO 3 | 4 | /****** Object: View [dbo].[vw_product] Script Date: 6/3/2023 10:48:26 AM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | /****** Script for SelectTopNRows command from SSMS ******/ 12 | 13 | Create View [dbo].[vw_product] 14 | AS 15 | SELECT [ProductID] 16 | ,cast([Name] as varchar(50)) AS ProductName 17 | ,[ProductNumber] 18 | ,[MakeFlag] 19 | ,[FinishedGoodsFlag] 20 | ,[Color] 21 | ,[SafetyStockLevel] 22 | ,[ReorderPoint] 23 | ,[StandardCost] 24 | ,[ListPrice] 25 | ,[Size] 26 | ,[SizeUnitMeasureCode] 27 
| ,[WeightUnitMeasureCode] 28 | ,[Weight] 29 | ,[DaysToManufacture] 30 | ,[ProductLine] 31 | ,[Class] 32 | ,[Style] 33 | ,[ProductSubcategoryID] 34 | ,[ProductModelID] 35 | ,[SellStartDate] 36 | ,[SellEndDate] 37 | ,[DiscontinuedDate] 38 | ,[rowguid] 39 | ,[ModifiedDate] 40 | FROM [AdventureWorks2019].[Production].[Product] 41 | GO 42 | 43 | 44 | -------------------------------------------------------------------------------- /sql/vw_productcategory.sql: -------------------------------------------------------------------------------- 1 | USE [AdventureWorks2019] 2 | GO 3 | 4 | /****** Object: View [dbo].[vw_productcategory] Script Date: 6/3/2023 10:48:44 AM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | /****** Script for SelectTopNRows command from SSMS ******/ 12 | 13 | CREATE View [dbo].[vw_productcategory] 14 | AS 15 | SELECT [ProductCategoryID] 16 | ,cast([Name] as varchar(50)) AS ProductCategory 17 | ,[rowguid] 18 | ,[ModifiedDate] 19 | FROM [AdventureWorks2019].[Production].[ProductCategory] 20 | GO 21 | 22 | 23 | -------------------------------------------------------------------------------- /sql/vw_productsubcategory.sql: -------------------------------------------------------------------------------- 1 | USE [AdventureWorks2019] 2 | GO 3 | 4 | /****** Object: View [dbo].[vw_productsubcategory] Script Date: 6/3/2023 10:49:01 AM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | /****** Script for SelectTopNRows command from SSMS ******/ 12 | Create View [dbo].[vw_productsubcategory] 13 | AS 14 | SELECT [ProductSubcategoryID] 15 | ,[ProductCategoryID] 16 | ,cast([Name] as varchar(50)) AS ProductSubCategory 17 | ,[rowguid] 18 | ,[ModifiedDate] 19 | FROM [AdventureWorks2019].[Production].[ProductSubcategory] 20 | GO 21 | 22 | 23 | -------------------------------------------------------------------------------- /sql/vw_salesorderheader.sql: -------------------------------------------------------------------------------- 1 | USE [AdventureWorks2019] 2 | GO 3 | 4 | /****** Object: View [dbo].[vw_salesorderheader] Script Date: 7/28/2023 10:58:57 AM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | /****** Script for SelectTopNRows command from SSMS ******/ 12 | create view [dbo].[vw_salesorderheader] 13 | as 14 | SELECT [SalesOrderID] 15 | ,[RevisionNumber] 16 | ,[OrderDate] 17 | ,[DueDate] 18 | ,[ShipDate] 19 | ,[Status] 20 | ,case when [OnlineOrderFlag] = 1 21 | then 'Y' 22 | else 'N' 23 | end as [OnlineOrderFlag] 24 | ,[SalesOrderNumber] 25 | ,cast([PurchaseOrderNumber] as varchar(25)) as [PurchaseOrderNumber] 26 | ,cast([AccountNumber] as varchar(15)) as [AccountNumber] 27 | ,[CustomerID] 28 | ,[SalesPersonID] 29 | ,[TerritoryID] 30 | ,[BillToAddressID] 31 | ,[ShipToAddressID] 32 | ,[ShipMethodID] 33 | ,[CreditCardID] 34 | ,[CreditCardApprovalCode] 35 | ,[CurrencyRateID] 36 | ,[SubTotal] 37 | ,[TaxAmt] 38 | ,[Freight] 39 | ,[TotalDue] 40 | ,[Comment] 41 | ,[rowguid] 42 | ,[ModifiedDate] 43 | FROM [AdventureWorks2019].[Sales].[SalesOrderHeader] 44 | GO 45 | -------------------------------------------------------------------------------- /sql/vw_salesreason.sql: -------------------------------------------------------------------------------- 1 | USE [AdventureWorks2019] 2 | GO 3 | 4 | /****** Object: View [dbo].[vw_salesreason] Script Date: 6/3/2023 10:49:23 AM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | /****** Script for 
SelectTopNRows command from SSMS ******/ 12 | 13 | Create View [dbo].[vw_salesreason] 14 | AS 15 | SELECT [SalesReasonID] 16 | ,cast([Name] as varchar(50)) AS SalesReason 17 | ,[ReasonType] 18 | ,[ModifiedDate] 19 | FROM [AdventureWorks2019].[Sales].[SalesReason] 20 | GO 21 | 22 | 23 | -------------------------------------------------------------------------------- /sql/vw_salesterritory.sql: -------------------------------------------------------------------------------- 1 | USE [AdventureWorks2019] 2 | GO 3 | 4 | /****** Object: View [dbo].[vw_salesterritory] Script Date: 6/3/2023 10:49:38 AM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | /****** Script for SelectTopNRows command from SSMS ******/ 12 | Create View [dbo].[vw_salesterritory] 13 | AS 14 | SELECT [TerritoryID] 15 | ,cast([Name] as varchar(50)) AS SalesTerritoryName 16 | ,[CountryRegionCode] 17 | ,[Group] 18 | ,[SalesYTD] 19 | ,[SalesLastYear] 20 | ,[CostYTD] 21 | ,[CostLastYear] 22 | ,[rowguid] 23 | ,[ModifiedDate] 24 | FROM [AdventureWorks2019].[Sales].[SalesTerritory] 25 | GO 26 | 27 | 28 | -------------------------------------------------------------------------------- /sql/vw_stateprovince.sql: -------------------------------------------------------------------------------- 1 | USE [AdventureWorks2019] 2 | GO 3 | 4 | /****** Object: View [dbo].[vw_stateprovince] Script Date: 6/3/2023 10:49:56 AM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | /****** Script for SelectTopNRows command from SSMS ******/ 12 | Create View [dbo].[vw_stateprovince] 13 | AS 14 | 15 | SELECT [StateProvinceID] 16 | ,[StateProvinceCode] 17 | ,[CountryRegionCode] 18 | ,[IsOnlyStateProvinceFlag] 19 | ,cast([Name] as varchar(50)) AS StatProvinceName 20 | ,[TerritoryID] 21 | ,[rowguid] 22 | ,[ModifiedDate] 23 | FROM [AdventureWorks2019].[Person].[StateProvince] 24 | GO 25 | 26 | 27 | -------------------------------------------------------------------------------- /sql/vw_store.sql: -------------------------------------------------------------------------------- 1 | USE [AdventureWorks2019] 2 | GO 3 | 4 | /****** Object: View [dbo].[vw_store] Script Date: 6/3/2023 10:50:15 AM ******/ 5 | SET ANSI_NULLS ON 6 | GO 7 | 8 | SET QUOTED_IDENTIFIER ON 9 | GO 10 | 11 | 12 | Create View [dbo].[vw_store] 13 | AS 14 | 15 | SELECT [BusinessEntityID] 16 | ,cast([Name] as varchar(50)) AS StoreName 17 | ,[SalesPersonID] 18 | ,[Demographics] 19 | ,[rowguid] 20 | ,[ModifiedDate] 21 | FROM [AdventureWorks2019].[Sales].[Store] 22 | GO 23 | 24 | 25 | -------------------------------------------------------------------------------- /star-schema-example1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/star-schema-example1.png -------------------------------------------------------------------------------- /warehouse/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /warehouse/.user.yml: -------------------------------------------------------------------------------- 1 | id: 99c83757-36ac-4835-8c6a-5e5ddef48a2d 2 | -------------------------------------------------------------------------------- /warehouse/.vscode/settings.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "sqltools.connections": [ 3 | { 4 | "previewLimit": 50, 5 | "server": "localhost", 6 | "port": 5432, 7 | "driver": "PostgreSQL", 8 | "name": "postgresAW", 9 | "database": "adventureworks", 10 | "username": "etl", 11 | "password": "demopass" 12 | } 13 | ] 14 | } -------------------------------------------------------------------------------- /warehouse/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt deps 7 | - dbt run 8 | - dbt test 9 | 10 | 11 | ### Resources: 12 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 13 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 14 | - Join the [chat](https://community.getdbt.com/) on Slack for live discussions and support 15 | - Find [dbt events](https://events.getdbt.com) near you 16 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 17 | -------------------------------------------------------------------------------- /warehouse/analyses/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/analyses/.gitkeep -------------------------------------------------------------------------------- /warehouse/analyses/create table product.sql: -------------------------------------------------------------------------------- 1 | SELECT 2 | productid as product_id, 3 | productname as name, 4 | productnumber as product_number, 5 | safetystocklevel as safety_stock_level, 6 | reorderpoint as reorder_point, 7 | standardcost as standard_cost, 8 | listprice as list_price, 9 | daystomanufacture as days_to_manufacture, 10 | modifieddate as modified_date 11 | --into staging.product 12 | FROM staging.stg_product 13 | limit 10; 14 | 15 | -------------------------------------------------------------------------------- /warehouse/analyses/update_modified_date_on_product_task.sql: -------------------------------------------------------------------------------- 1 | CREATE FUNCTION update_modified_date_on_product_task() 2 | RETURNS TRIGGER AS $$ 3 | BEGIN 4 | NEW.modified_date = now(); 5 | RETURN NEW; 6 | END; 7 | $$ language 'plpgsql'; -------------------------------------------------------------------------------- /warehouse/analyses/update_product_table_script.sql: -------------------------------------------------------------------------------- 1 | update staging.product 2 | set name = 'BE Ball Bearing' 3 | where product_id = 3 -------------------------------------------------------------------------------- /warehouse/analyses/update_product_task_on.sql: -------------------------------------------------------------------------------- 1 | CREATE TRIGGER update_product_task_on 2 | BEFORE UPDATE 3 | ON 4 | staging.product 5 | FOR EACH ROW 6 | EXECUTE PROCEDURE update_modified_date_on_product_task(); -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/datawarehouse_dagster/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/datawarehouse_dagster/datawarehouse_dagster/__init__.py -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/airbyte.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/airbyte.cpython-39.pyc -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/assets.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/assets.cpython-39.pyc -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/constants.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/constants.cpython-39.pyc -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/dbt.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/dbt.cpython-39.pyc -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/definitions.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/definitions.cpython-39.pyc -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/schedules.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/datawarehouse_dagster/datawarehouse_dagster/__pycache__/schedules.cpython-39.pyc -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/airbyte.py: -------------------------------------------------------------------------------- 1 | 2 | from dagster_airbyte import airbyte_resource, load_assets_from_airbyte_instance 3 | 4 | from .constants import AIRBYTE_CONFIG 5 | 6 | airbyte_instance = airbyte_resource.configured(AIRBYTE_CONFIG) 7 | 8 | airbyte_assets = load_assets_from_airbyte_instance( airbyte_instance, key_prefix=["src_postgres"]) -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/constants.py: -------------------------------------------------------------------------------- 1 | import os 2 | from pathlib import Path 3 | 4 | from dagster_dbt import DbtCliResource 5 | from dagster._utils import file_relative_path 6 | 7 | dbt_project_dir = Path(__file__).joinpath("..", "..", "..").resolve() 8 | dbt = DbtCliResource(project_dir=os.fspath(dbt_project_dir)) 9 | 10 | # If DAGSTER_DBT_PARSE_PROJECT_ON_LOAD is set, a manifest will be created at runtime. 11 | # Otherwise, we expect a manifest to be present in the project's target directory. 12 | if os.getenv("DAGSTER_DBT_PARSE_PROJECT_ON_LOAD"): 13 | dbt_parse_invocation = dbt.cli(["parse"], manifest={}).wait() 14 | dbt_manifest_path = dbt_parse_invocation.target_path.joinpath("manifest.json") 15 | else: 16 | dbt_manifest_path = dbt_project_dir.joinpath("target", "manifest.json") 17 | 18 | # Airbyte configs 19 | AIRBYTE_CONNECTION_ID = os.environ.get("AIRBYTE_CONNECTION_ID", "dfe6eae2-7fd3-4114-8ffc-5c78d6757446") 20 | 21 | 22 | AIRBYTE_CONFIG = { 23 | "host": os.environ.get("AIRBYTE_HOST", "localhost"), 24 | "port": os.environ.get("AIRBYTE_PORT", "8000"), 25 | "username": "airbyte", 26 | "password": "password", 27 | } 28 | 29 | # 30 | DBT_PROJECT_DIR = file_relative_path(__file__, "../../dbt_project") 31 | DBT_PROFILES_DIR = file_relative_path(__file__, "../../dbt_project") 32 | DBT_CONFIG = {"project_dir": DBT_PROJECT_DIR, "profiles_dir": DBT_PROFILES_DIR} -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/dbt.py: -------------------------------------------------------------------------------- 1 | from dagster import OpExecutionContext 2 | from dagster_dbt import DbtCliResource, dbt_assets 3 | 4 | from .constants import dbt_manifest_path 5 | 6 | 7 | @dbt_assets(manifest=dbt_manifest_path) 8 | def warehouse_dbt_assets(context: OpExecutionContext, dbt: DbtCliResource): 9 | yield from dbt.cli(["build"], context=context).stream() -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/definitions.py: -------------------------------------------------------------------------------- 1 | import os 2 | from dagster import Definitions,ScheduleDefinition, define_asset_job 3 | from dagster_dbt import DbtCliResource 4 | 5 | from .dbt import warehouse_dbt_assets 6 | from .airbyte import airbyte_assets 7 | from .constants import dbt_project_dir 8 | from .schedules import schedules 9 | 10 | defs = Definitions( 11 | assets=[warehouse_dbt_assets, airbyte_assets], 12 | resources={ 13 | "dbt": 
DbtCliResource(project_dir=os.fspath(dbt_project_dir)), 14 | }, 15 | schedules=[ 16 | # update all assets once a day 17 | ScheduleDefinition( 18 | job=define_asset_job("all_assets", selection="*"), cron_schedule="@daily" 19 | ), 20 | ], 21 | ) -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/datawarehouse_dagster/schedules.py: -------------------------------------------------------------------------------- 1 | """ 2 | To add a daily schedule that materializes your dbt assets, uncomment the following lines. 3 | """ 4 | from dagster_dbt import build_schedule_from_dbt_selection 5 | 6 | from .dbt import warehouse_dbt_assets 7 | 8 | schedules = [ 9 | # build_schedule_from_dbt_selection( 10 | # [warehouse_dbt_assets], 11 | # job_name="materialize_dbt_models", 12 | # cron_schedule="0 0 * * *", 13 | # dbt_select="fqn:*", 14 | # ), 15 | ] -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [tool.dagster] 6 | module_name = "datawarehouse_dagster.definitions" 7 | code_location_name = "datawarehouse_dagster" -------------------------------------------------------------------------------- /warehouse/datawarehouse_dagster/setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import find_packages, setup 2 | 3 | setup( 4 | name="datawarehouse_dagster", 5 | version="0.0.1", 6 | packages=find_packages(), 7 | install_requires=[ 8 | "dagster", 9 | "dagster-cloud", 10 | "dagster-dbt", 11 | "dbt-core>=1.4.0", 12 | "dbt-postgres","dbt-postgres", 13 | ], 14 | extras_require={ 15 | "dev": [ 16 | "dagster-webserver", 17 | ] 18 | }, 19 | ) -------------------------------------------------------------------------------- /warehouse/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'warehouse' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'warehouse' 11 | 12 | vars: 13 | 'dbt_date:time_zone': 'America/New_York' 14 | 15 | # These configurations specify where dbt should look for different types of files. 16 | # The `model-paths` config, for example, states that models in this project can be 17 | # found in the "models/" directory. You probably won't need to change these! 18 | model-paths: ["models"] 19 | analysis-paths: ["analyses"] 20 | test-paths: ["tests"] 21 | seed-paths: ["seeds"] 22 | macro-paths: ["macros"] 23 | snapshot-paths: ["snapshots"] 24 | 25 | target-path: "target" # directory which will store compiled SQL files 26 | clean-targets: # directories to be removed by `dbt clean` 27 | - "target" 28 | - "dbt_packages" 29 | 30 | 31 | # Configuring models 32 | # Full documentation: https://docs.getdbt.com/docs/configuring-models 33 | 34 | # In this example config, we tell dbt to build all models in the example/ 35 | # directory as views. These settings can be overridden in the individual model 36 | # files using the `{{ config(...) }}` macro. 
37 | models: 38 | warehouse: 39 | # Config indicated by + and applies to all files under models/example/ 40 | staging: 41 | +materialized: table 42 | +schema: staging 43 | mart: 44 | +materialized: table 45 | +schema: sales 46 | -------------------------------------------------------------------------------- /warehouse/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/macros/.gitkeep -------------------------------------------------------------------------------- /warehouse/macros/generate_schema_name.sql: -------------------------------------------------------------------------------- 1 | {% macro generate_schema_name(custom_schema_name, node) -%} 2 | 3 | {%- set default_schema = target.schema -%} 4 | {%- if custom_schema_name is none -%} 5 | 6 | {{ default_schema }} 7 | 8 | {%- else -%} 9 | 10 | {{ custom_schema_name | trim }} 11 | 12 | {%- endif -%} 13 | 14 | {%- endmacro %} -------------------------------------------------------------------------------- /warehouse/models/mart/dimAddress.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ dbt_utils.generate_surrogate_key(['stg_address.addressid']) }} as address_key, 3 | stg_address.addressid, 4 | stg_address.city as city_name, 5 | stg_address.postalcode, 6 | stg_address.addressline1 || ' '|| coalesce(stg_address.addressline2, '') as Addressline, 7 | stg_stateprovince.statprovincename as state_name, 8 | stg_countryregion.countryregionname as country_name 9 | from {{ ref('stg_address') }} 10 | left join {{ ref('stg_stateprovince') }} on stg_address.stateprovinceid = stg_stateprovince.stateprovinceid 11 | left join {{ ref('stg_countryregion') }} on stg_stateprovince.countryregioncode = stg_countryregion.countryregioncode -------------------------------------------------------------------------------- /warehouse/models/mart/dimCustomer.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ dbt_utils.generate_surrogate_key(['stg_customer.customerid']) }} as customer_key, 3 | stg_customer.customerid, 4 | stg_person.businessentityid as personbusinessentityid, 5 | stg_person.title, 6 | stg_person.firstname || ' '|| lastname as fullname, 7 | stg_person.houseownerflag, 8 | stg_person.occupation, 9 | stg_person.maritalstatus, 10 | stg_person.commutedistance, 11 | stg_person.education, 12 | --stg_person.gender, 13 | stg_person.numbercarsowned, 14 | stg_person.totalchildren, 15 | stg_person.birthdate, 16 | stg_person.datefirstpurchase, 17 | stg_countryregion.countryregionname as country, 18 | stg_address.city, 19 | stg_stateprovince.statprovincename as state, 20 | stg_address.postalcode, 21 | stg_address.addressline1, 22 | stg_address.addressline2 23 | from {{ ref('stg_customer') }} 24 | left join {{ ref('stg_person') }} on stg_customer.personid = stg_person.businessentityid 25 | left join {{ ref('stg_entityaddress') }} on stg_entityaddress.businessentityid = stg_person.businessentityid 26 | left join {{ ref('stg_address') }} on stg_address.addressid = stg_entityaddress.addressid 27 | left join {{ ref('stg_stateprovince') }} on stg_stateprovince.stateprovinceid = stg_address.stateprovinceid 28 | left join {{ ref('stg_countryregion') }} on stg_countryregion.countryregioncode = stg_stateprovince.countryregioncode 29 | where persontype = 'IN' 30 | and addresstypeid = 2 
-------------------------------------------------------------------------------- /warehouse/models/mart/dimDate.sql: -------------------------------------------------------------------------------- 1 | with date_dimension as ( 2 | select * from {{ ref('stg_date') }} 3 | ), 4 | full_dt as ( 5 | {{ dbt_date.get_base_dates(start_date="2011-01-01", end_date="2014-12-31") }} 6 | ), 7 | full_dt_tr as ( 8 | select 9 | d.*, 10 | f.date_day as fulldt, 11 | {{ dbt_date.convert_timezone("f.date_day", "America/New_York", "UTC") }} as dulldtz, 12 | {{ dbt_date.convert_timezone("f.date_day", "America/New_York", source_tz="UTC") }} as dulldtzt, 13 | {{ dbt_date.convert_timezone("f.date_day", "America/New_York") }} as test, 14 | --f.date_day AT TIME ZONE 'PST' AS "direct_pst", 15 | f.date_day::timestamp "direct_dts", 16 | f.date_day::timestamp AT TIME ZONE 'UTC' AS "ts_utc" 17 | from 18 | date_dimension d 19 | left join full_dt f on d.date_day = cast(f.date_day as date) 20 | ) 21 | select 22 | {{ dbt_utils.generate_surrogate_key(['direct_dts']) }} as date_key, 23 | * 24 | From full_dt_tr -------------------------------------------------------------------------------- /warehouse/models/mart/dimOrderStatus.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ dbt_utils.generate_surrogate_key(['stg_salesorderheader.status']) }} as order_status_key, 3 | status as order_status, 4 | case 5 | when status = 1 then 'in_process' 6 | when status = 2 then 'approved' 7 | when status = 3 then 'backordered' 8 | when status = 4 then 'rejected' 9 | when status = 5 then 'shipped' 10 | when status = 6 then 'cancelled' 11 | else 'no_status' 12 | end as order_status_name 13 | from {{ ref('stg_salesorderheader') }} -------------------------------------------------------------------------------- /warehouse/models/mart/dimProduct.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ dbt_utils.generate_surrogate_key(['stg_product.productid']) }} as product_key, 3 | stg_product.productid, 4 | stg_product.productname as product_name, 5 | stg_product.productnumber, 6 | stg_product.color, 7 | stg_product.daystomanufacture, 8 | stg_product.safetystocklevel, 9 | stg_product.standardcost, 10 | stg_productsubcategory.productsubcategory as product_subcategory_name, 11 | stg_productcategory.productcategory as product_category_name, 12 | stg_product.sellstartdate, 13 | stg_product.sellenddate 14 | from {{ ref('stg_product') }} 15 | left join {{ ref('stg_productsubcategory') }} on stg_product.productsubcategoryid = stg_productsubcategory.productsubcategoryid 16 | left join {{ ref('stg_productcategory') }} on stg_productsubcategory.productcategoryid = stg_productcategory.productcategoryid -------------------------------------------------------------------------------- /warehouse/models/mart/dimTerritory.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ dbt_utils.generate_surrogate_key(['stg_salesterritory.territoryid']) }} as territory_key, 3 | territoryid, 4 | salesterritoryname, 5 | "Group" as territory_group, 6 | countryregioncode, 7 | costytd, 8 | salesytd, 9 | costlastyear, 10 | saleslastyear, 11 | modifieddate 12 | from {{ ref('stg_salesterritory') }} -------------------------------------------------------------------------------- /warehouse/models/mart/fctSales.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ 
dbt_utils.generate_surrogate_key(['stg_salesorderdetail.salesorderid', 'salesorderdetailid']) }} as sales_key, 3 | {{ dbt_utils.generate_surrogate_key(['productid']) }} as product_key, 4 | {{ dbt_utils.generate_surrogate_key(['customerid']) }} as customer_key, 5 | {{ dbt_utils.generate_surrogate_key(['creditcardid']) }} as creditcard_key, 6 | {{ dbt_utils.generate_surrogate_key(['shiptoaddressid']) }} as ship_address_key, 7 | {{ dbt_utils.generate_surrogate_key(['status']) }} as order_status_key, 8 | {{ dbt_utils.generate_surrogate_key(['orderdate']) }} as order_date_key, 9 | {{ dbt_utils.generate_surrogate_key(['shipdate']) }} as ship_date_key, 10 | {{ dbt_utils.generate_surrogate_key(['duedate']) }} as due_date_key, 11 | {{ dbt_utils.generate_surrogate_key(['territoryid']) }} as territory_key, 12 | 13 | orderdate, 14 | onlineorderflag, 15 | stg_salesorderdetail.unitpricediscount as unitpricediscount, 16 | stg_salesorderheader.salesordernumber, 17 | stg_salesorderdetail.salesorderid, 18 | stg_salesorderdetail.salesorderdetailid, 19 | stg_salesorderdetail.unitprice, 20 | stg_salesorderdetail.orderqty, 21 | stg_salesorderdetail.linetotal as revenue, 22 | stg_salesorderdetail.linetotal as salesamount, 23 | case when stg_salesorderdetail.unitpricediscount > 0 24 | then stg_salesorderdetail.linetotal * stg_salesorderdetail.unitpricediscount 25 | else stg_salesorderdetail.linetotal 26 | end as totaldiscount, 27 | stg_salesorderheader.taxamt 28 | from {{ ref('stg_salesorderdetail') }} 29 | inner join {{ ref('stg_salesorderheader') }} on stg_salesorderdetail.salesorderid = stg_salesorderheader.salesorderid -------------------------------------------------------------------------------- /warehouse/models/mart/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: dimaddress 5 | description: contains address information for the customers. 6 | 7 | - name: dimcustomer 8 | description: contains customer details for the company. 9 | 10 | - name: dimorderstatus 11 | description: contains order status details. 12 | 13 | - name: dimproduct 14 | description: contains product information and product hierarchy. 15 | 16 | - name: dimterritory 17 | description: contains sales territory details. 18 | 19 | - name: dimdate 20 | description: date dimension for datetime analysis. 21 | 22 | - name: fctsales 23 | description: contains sales transactions across the dimensions included in this data model.
-------------------------------------------------------------------------------- /warehouse/models/staging/schema.yml: -------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | models: 5 | 6 | - name: stg_address 7 | description: "" 8 | columns: 9 | - name: city 10 | description: "City name" 11 | - name: addressid 12 | tests: 13 | - unique 14 | - not_null 15 | - name: postalcode 16 | description: "Zip Code" 17 | - name: addressline1 18 | description: "Address First Line" 19 | - name: addressline2 20 | description: "Address Second Line" 21 | - name: modifieddate 22 | description: "Address modified date" 23 | - name: spatiallocation 24 | description: "" 25 | - name: stateprovinceid 26 | description: "Foreign key for stateprovince" 27 | tests: 28 | - relationships: 29 | to: ref('stg_stateprovince') 30 | field: stateprovinceid 31 | 32 | 33 | 34 | - name: stg_customer 35 | columns: 36 | - name: storeid 37 | description: "" 38 | - name: rowguid 39 | description: "" 40 | - name: personid 41 | description: "" 42 | - name: customerid 43 | description: "" 44 | description: "The primary key for this table" 45 | tests: 46 | - unique 47 | - not_null 48 | - name: territoryid 49 | description: "" 50 | - name: modifieddate 51 | description: "" 52 | - name: accountnumber 53 | description: "" 54 | 55 | 56 | - name: stg_salesorderdetail 57 | columns: 58 | - name: orderqty 59 | description: ' ' 60 | - name: linetotal 61 | description: ' ' 62 | - name: productid 63 | description: ' ' 64 | - name: unitprice 65 | description: ' ' 66 | - name: modifieddate 67 | description: ' ' 68 | - name: salesorderid 69 | description: "The column is part of the primary key for this table" 70 | tests: 71 | - not_null 72 | description: ' ' 73 | - name: specialofferid 74 | description: ' ' 75 | - name: unitpricediscount 76 | description: ' ' 77 | - name: salesorderdetailid 78 | description: "The column is part of the primary key for this table" 79 | tests: 80 | - not_null 81 | - name: carriertrackingnumber 82 | description: ' ' 83 | 84 | 85 | - name: stg_salesorderheadersalesreason 86 | columns: 87 | - name: modifieddate 88 | description: ' ' 89 | - name: salesorderid 90 | description: "The column is part of the primary key for this table" 91 | tests: 92 | - not_null 93 | - name: salesreasonid 94 | description: "The column is part of the primary key for this table" 95 | tests: 96 | - not_null 97 | 98 | 99 | - name: stg_salesorderheader 100 | columns: 101 | - name: status 102 | description: ' ' 103 | - name: taxamt 104 | description: ' ' 105 | - name: comment 106 | description: ' ' 107 | - name: duedate 108 | description: ' ' 109 | - name: freight 110 | description: ' ' 111 | - name: rowguid 112 | description: ' ' 113 | - name: shipdate 114 | description: ' ' 115 | - name: subtotal 116 | description: ' ' 117 | - name: totaldue 118 | description: ' ' 119 | - name: orderdate 120 | description: ' ' 121 | - name: customerid 122 | description: ' ' 123 | - name: territoryid 124 | description: ' ' 125 | - name: creditcardid 126 | description: ' ' 127 | - name: modifieddate 128 | description: ' ' 129 | - name: salesorderid 130 | description: ' ' 131 | description: "The primary key for this table" 132 | tests: 133 | - unique 134 | - not_null 135 | - name: shipmethodid 136 | description: ' ' 137 | - name: salespersonid 138 | description: ' ' 139 | - name: currencyrateid 140 | description: ' ' 141 | - name: revisionnumber 142 | description: ' ' 143 | - name: billtoaddressid 144 | description: 
' ' 145 | - name: shiptoaddressid 146 | description: ' ' 147 | - name: salesordernumber 148 | description: ' ' 149 | - name: creditcardapprovalcode 150 | description: ' ' 151 | 152 | 153 | - name: stg_countryregion 154 | columns: 155 | - name: modifieddate 156 | description: ' ' 157 | - name: countryregioncode 158 | description: ' ' 159 | description: "The primary key for this table" 160 | tests: 161 | - unique 162 | - not_null 163 | - name: countryregionname 164 | description: ' ' 165 | 166 | 167 | - name: stg_product 168 | columns: 169 | - name: size 170 | description: ' ' 171 | - name: class 172 | description: ' ' 173 | - name: color 174 | description: ' ' 175 | - name: style 176 | description: ' ' 177 | - name: weight 178 | description: ' ' 179 | - name: rowguid 180 | description: ' ' 181 | - name: listprice 182 | description: ' ' 183 | - name: productid 184 | description: ' ' 185 | description: "The primary key for this table" 186 | tests: 187 | - unique 188 | - not_null 189 | - name: productline 190 | description: ' ' 191 | - name: productname 192 | description: ' ' 193 | - name: sellenddate 194 | description: ' ' 195 | - name: modifieddate 196 | description: ' ' 197 | - name: reorderpoint 198 | description: ' ' 199 | - name: standardcost 200 | description: ' ' 201 | - name: productnumber 202 | description: ' ' 203 | - name: sellstartdate 204 | description: ' ' 205 | - name: productmodelid 206 | description: ' ' 207 | - name: discontinueddate 208 | description: ' ' 209 | - name: safetystocklevel 210 | description: ' ' 211 | - name: daystomanufacture 212 | description: ' ' 213 | - name: sizeunitmeasurecode 214 | description: ' ' 215 | - name: productsubcategoryid 216 | description: ' ' 217 | - name: weightunitmeasurecode 218 | description: ' ' 219 | 220 | 221 | - name: stg_person 222 | columns: 223 | - name: title 224 | description: ' ' 225 | - name: suffix 226 | description: ' ' 227 | - name: rowguid 228 | description: ' ' 229 | - name: lastname 230 | description: ' ' 231 | - name: firstname 232 | description: ' ' 233 | - name: middlename 234 | description: ' ' 235 | - name: persontype 236 | description: ' ' 237 | - name: demographics 238 | description: ' ' 239 | - name: modifieddate 240 | description: ' ' 241 | - name: emailpromotion 242 | description: ' ' 243 | - name: businessentityid 244 | description: ' ' 245 | description: "The primary key for this table" 246 | tests: 247 | - unique 248 | - not_null 249 | - name: additionalcontactinfo 250 | description: ' ' 251 | 252 | 253 | - name: stg_store 254 | columns: 255 | - name: rowguid 256 | description: ' ' 257 | - name: demographics 258 | description: ' ' 259 | - name: modifieddate 260 | description: ' ' 261 | - name: salespersonid 262 | description: ' ' 263 | - name: businessentityid 264 | description: "The primary key for this table" 265 | tests: 266 | - unique 267 | - not_null 268 | - name: statprovincename 269 | description: ' ' 270 | 271 | 272 | - name: stg_salesreason 273 | columns: 274 | - name: salesreason 275 | description: ' ' 276 | - name: modifieddate 277 | description: ' ' 278 | - name: salesreasonid 279 | description: ' ' 280 | description: "The primary key for this table" 281 | tests: 282 | - unique 283 | - not_null 284 | 285 | 286 | - name: stg_salesterritory 287 | columns: 288 | - name: group 289 | description: ' ' 290 | - name: costytd 291 | description: ' ' 292 | - name: rowguid 293 | description: ' ' 294 | - name: salesytd 295 | description: ' ' 296 | - name: territoryid 297 | description: ' ' 298 | description: 
"The primary key for this table" 299 | tests: 300 | - unique 301 | - not_null 302 | - name: costlastyear 303 | description: ' ' 304 | - name: modifieddate 305 | description: ' ' 306 | - name: saleslastyear 307 | description: ' ' 308 | - name: countryregioncode 309 | description: ' ' 310 | - name: salesterritoryname 311 | description: ' ' 312 | 313 | 314 | - name: stg_productsubcategory 315 | columns: 316 | - name: rowguid 317 | description: ' ' 318 | - name: modifieddate 319 | description: ' ' 320 | - name: productcategoryid 321 | description: "" 322 | - name: productsubcategory 323 | description: ' ' 324 | - name: productsubcategoryid 325 | description: "The primary key for this table" 326 | tests: 327 | - unique 328 | - not_null 329 | 330 | 331 | - name: stg_stateprovince 332 | columns: 333 | - name: rowguid 334 | description: ' ' 335 | - name: territoryid 336 | description: ' ' 337 | - name: modifieddate 338 | description: ' ' 339 | - name: stateprovinceid 340 | description: "The primary key for this table" 341 | tests: 342 | - unique 343 | - not_null 344 | - name: statprovincename 345 | description: ' ' 346 | - name: countryregioncode 347 | description: ' ' 348 | - name: stateprovincecode 349 | description: ' ' 350 | 351 | - name: stg_productcategory 352 | description: "This table stores product category data" 353 | columns: 354 | - name: productcategoryid 355 | description: "The primary key for this table" 356 | tests: 357 | - unique 358 | - not_null 359 | - name: productcategory 360 | description: "Productcategory label" 361 | tests: 362 | - accepted_values: 363 | values: ['Bikes', 'Components', 'Clothing', 'Accessories'] 364 | -------------------------------------------------------------------------------- /warehouse/models/staging/source.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: src_postgres 5 | schema: source 6 | database: adventureworks 7 | 8 | tables: 9 | - name: vw_address 10 | identifier: address 11 | - name: Customer 12 | identifier: customer 13 | - name: SalesOrderDetail 14 | identifier: salesorderdetail 15 | - name: salesorderheader 16 | - name: salesorderheadersalesreason 17 | - name: vw_countryregion 18 | - name: vw_person 19 | - name: vw_product 20 | - name: vw_productcategory 21 | - name: vw_productsubcategory 22 | - name: vw_salesreason 23 | - name: vw_salesterritory 24 | - name: vw_stateprovince 25 | - name: vw_store 26 | - name: vw_salesorderheader 27 | - name: BusinessEntityAddress 28 | identifier: businessentityaddress 29 | -------------------------------------------------------------------------------- /warehouse/models/staging/stg_address.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'vw_address') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | city, 9 | rowguid, 10 | addressid, 11 | postalcode, 12 | addressline1, 13 | addressline2, 14 | modifieddate, 15 | spatiallocation, 16 | stateprovinceid 17 | from source 18 | ) 19 | 20 | select * from renamed -------------------------------------------------------------------------------- /warehouse/models/staging/stg_countryregion.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'vw_countryregion') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | modifieddate, 9 | countryregioncode, 10 | countryregionname 11 | from source 12 | ) 
13 | 14 | select * from renamed -------------------------------------------------------------------------------- /warehouse/models/staging/stg_customer.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'Customer') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | storeid, 9 | rowguid, 10 | personid, 11 | customerid, 12 | territoryid, 13 | modifieddate, 14 | accountnumber 15 | from source 16 | ) 17 | 18 | select * from renamed -------------------------------------------------------------------------------- /warehouse/models/staging/stg_date.sql: -------------------------------------------------------------------------------- 1 | with date_dim as ( 2 | 3 | 4 | {{ dbt_date.get_date_dimension("2011-01-01", "2014-12-31") }} 5 | 6 | ) 7 | select 8 | * 9 | from date_dim -------------------------------------------------------------------------------- /warehouse/models/staging/stg_entityaddress.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'BusinessEntityAddress') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | addresstypeid, 9 | businessentityid, 10 | modifieddate, 11 | addressid 12 | from source 13 | ) 14 | 15 | select * from renamed -------------------------------------------------------------------------------- /warehouse/models/staging/stg_person.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'vw_person') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | title, 9 | suffix, 10 | rowguid, 11 | lastname, 12 | firstname, 13 | middlename, 14 | persontype, 15 | modifieddate, 16 | emailpromotion, 17 | businessentityid, 18 | houseownerflag, 19 | occupation, 20 | maritalstatus, 21 | commutedistance, 22 | education, 23 | numbercarsowned, 24 | totalchildren, 25 | birthdate, 26 | datefirstpurchase 27 | from source 28 | ) 29 | 30 | select * from renamed -------------------------------------------------------------------------------- /warehouse/models/staging/stg_product.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'vw_product') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | "Size", 9 | "Class", 10 | color, 11 | "Style", 12 | weight, 13 | rowguid, 14 | listprice, 15 | productid, 16 | productline, 17 | productname, 18 | sellenddate AS sellenddate, 19 | modifieddate, 20 | reorderpoint, 21 | standardcost, 22 | productnumber, 23 | sellstartdate AS sellstartdate, 24 | productmodelid, 25 | discontinueddate, 26 | safetystocklevel, 27 | daystomanufacture, 28 | sizeunitmeasurecode, 29 | productsubcategoryid 30 | from source 31 | ) 32 | 33 | select * from renamed -------------------------------------------------------------------------------- /warehouse/models/staging/stg_productcategory.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'vw_productcategory') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | rowguid, 9 | modifieddate, 10 | productcategory, 11 | productcategoryid 12 | from source 13 | ) 14 | 15 | select * from renamed -------------------------------------------------------------------------------- /warehouse/models/staging/stg_productsubcategory.sql: 
-------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'vw_productsubcategory') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | rowguid, 9 | modifieddate, 10 | productcategoryid, 11 | productsubcategory, 12 | productsubcategoryid 13 | from source 14 | ) 15 | 16 | select * from renamed -------------------------------------------------------------------------------- /warehouse/models/staging/stg_salesorderdetail.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'SalesOrderDetail') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | rowguid, 9 | orderqty, 10 | linetotal, 11 | productid, 12 | unitprice, 13 | modifieddate, 14 | salesorderid, 15 | specialofferid, 16 | unitpricediscount, 17 | salesorderdetailid, 18 | carriertrackingnumber 19 | from source 20 | ) 21 | 22 | select * from renamed -------------------------------------------------------------------------------- /warehouse/models/staging/stg_salesorderheader.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'vw_salesorderheader') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | status, 9 | onlineorderflag, 10 | taxamt, 11 | purchaseordernumber, 12 | "Comment", -- First letter is uppercase, hence the double quotes 13 | duedate::timestamp AS duedate, 14 | freight, 15 | rowguid, 16 | shipdate::timestamp AS shipdate, 17 | subtotal, 18 | totaldue, 19 | orderdate::timestamp AS orderdate, 20 | customerid, 21 | territoryid, 22 | creditcardid, 23 | modifieddate, 24 | salesorderid, 25 | shipmethodid, 26 | salespersonid, 27 | currencyrateid, 28 | revisionnumber, 29 | billtoaddressid, 30 | shiptoaddressid, 31 | salesordernumber, 32 | creditcardapprovalcode 33 | from source 34 | ) 35 | 36 | select * from renamed -------------------------------------------------------------------------------- /warehouse/models/staging/stg_salesterritory.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'vw_salesterritory') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | "Group", 9 | costytd, 10 | rowguid, 11 | salesytd, 12 | territoryid, 13 | costlastyear, 14 | modifieddate, 15 | saleslastyear, 16 | countryregioncode, 17 | salesterritoryname 18 | from source 19 | ) 20 | 21 | select * from renamed -------------------------------------------------------------------------------- /warehouse/models/staging/stg_stateprovince.sql: -------------------------------------------------------------------------------- 1 | with source as ( 2 | 3 | select * from {{ source('src_postgres', 'vw_stateprovince') }} 4 | ), 5 | renamed as ( 6 | 7 | select 8 | rowguid, 9 | territoryid, 10 | modifieddate, 11 | stateprovinceid, 12 | statprovincename, 13 | countryregioncode, 14 | stateprovincecode 15 | from source 16 | ) 17 | 18 | select * from renamed -------------------------------------------------------------------------------- /warehouse/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 1.1.1 4 | 5 | - package: dbt-labs/codegen 6 | version: 0.9.0 7 | 8 | - package: calogica/dbt_date 9 | version: 0.10.0 10 | --------------------------------------------------------------------------------
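The packages.yml above pins dbt-labs/dbt_utils, dbt-labs/codegen, and calogica/dbt_date; they are installed with `dbt deps` before the project is built (dbt_date already powers stg_date via `get_date_dimension`). As a minimal sketch of how dbt_utils can be used downstream, the model below hashes the composite natural key of an order line into a single surrogate key with `dbt_utils.generate_surrogate_key`. It is illustrative only, not a file in this repo; the column names are assumed from stg_salesorderdetail.

```sql
-- Illustrative sketch, not part of this project.
-- Assumes `dbt deps` has installed the packages pinned in packages.yml.
with order_lines as (

    select * from {{ ref('stg_salesorderdetail') }}
)

select
    -- hash the composite natural key into one surrogate key column
    {{ dbt_utils.generate_surrogate_key(['salesorderid', 'salesorderdetailid']) }} as salesorderkey,
    salesorderid,
    salesorderdetailid,
    productid,
    orderqty,
    unitprice,
    linetotal
from order_lines
```
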
/warehouse/profiles.yml: -------------------------------------------------------------------------------- 1 | warehouse: 2 | outputs: 3 | 4 | dev: 5 | type: postgres 6 | threads: 4 7 | host: localhost 8 | port: 5432 9 | user: "{{ env_var('PGUID') }}" 10 | pass: "{{ env_var('PGPASS') }}" 11 | dbname: adventureworks 12 | schema: public 13 | 14 | prod: 15 | type: postgres 16 | threads: 4 17 | host: localhost 18 | port: 5432 19 | user: "{{ env_var('PGUID') }}" 20 | pass: "{{ env_var('PGPASS') }}" 21 | dbname: adventureworks 22 | schema: public 23 | 24 | target: dev 25 | 26 | -------------------------------------------------------------------------------- /warehouse/schema.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /warehouse/seeds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/seeds/.gitkeep -------------------------------------------------------------------------------- /warehouse/seeds/countryisocodes.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/seeds/countryisocodes.csv -------------------------------------------------------------------------------- /warehouse/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/snapshots/.gitkeep -------------------------------------------------------------------------------- /warehouse/snapshots/product_snapshot.sql: -------------------------------------------------------------------------------- 1 | {% snapshot product_snapshot %} 2 | 3 | {{ 4 | config( 5 | target_schema='snapshots', 6 | unique_key='product_id', 7 | strategy='timestamp', 8 | updated_at='modified_date', 9 | ) 10 | }} 11 | 12 | select * from {{ source('adventureworks', 'product') }} 13 | 14 | {% endsnapshot %} -------------------------------------------------------------------------------- /warehouse/snapshots/source.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: adventureworks 5 | schema: staging 6 | database: adventureworks 7 | 8 | tables: 9 | - name: product -------------------------------------------------------------------------------- /warehouse/source.py: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /warehouse/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/hnawaz007/dbt-dw/2181198a81e021c42940daef1c24ad3c4d8c61fa/warehouse/tests/.gitkeep --------------------------------------------------------------------------------
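The product_snapshot above implements a type-2 slowly changing dimension: each run of `dbt snapshot` compares the `modified_date` timestamp per `product_id` and closes out superseded rows. As a rough sketch (assuming the snapshot has been materialized into the `snapshots` schema), the current version of every product can be read back through the `dbt_valid_to` metadata column that dbt adds to snapshot tables:

```sql
-- Sketch only: dbt appends dbt_valid_from / dbt_valid_to to snapshot tables.
-- Rows whose dbt_valid_to is null are the currently valid product versions.
select *
from {{ ref('product_snapshot') }}
where dbt_valid_to is null
```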