├── data ├── .gitkeep └── .DS_Store ├── analysis └── .gitkeep ├── macros ├── .gitkeep ├── db │ ├── postgres │ │ ├── escape.sql │ │ ├── quote_string.sql │ │ └── postgres_type_db.sql │ ├── redshift │ │ ├── escape.sql │ │ └── quote_string.sql │ ├── bigquery │ │ ├── quote_string.sql │ │ ├── quote_column_name.sql │ │ └── split_and_return_nth_value.sql │ ├── core │ │ ├── escape.sql │ │ ├── identifier_mapping.sql │ │ ├── quote_column_name.sql │ │ ├── quote_string.sql │ │ └── split_and_return_nth_value.sql │ └── snowflake │ │ └── identifier_mapping.sql ├── public │ ├── cleaning │ │ ├── clean_capitalize_words.sql │ │ ├── clean_additional_whitespace.sql │ │ └── clean_blacklist.sql │ ├── filtering │ │ ├── remove_duplicates.sql │ │ └── get_duplicates.sql │ ├── store │ │ ├── export_table_samples.sql │ │ ├── export_tests_history.sql │ │ ├── export_alerts.sql │ │ └── generate_overview.sql │ ├── normalizing │ │ └── normalize_values.sql │ └── validating │ │ ├── valid_with_regex.sql │ │ └── regex_dict.sql ├── utils │ ├── for_loops.sql │ ├── depends_macro.sql │ ├── comma_delimited_list.sql │ ├── comparison_text.sql │ ├── bool_to_string.sql │ ├── is_list.sql │ ├── formulas.sql │ ├── quote.sql │ ├── dict_from_list.sql │ ├── get_database.sql │ ├── in_compile.sql │ ├── regular_expression.sql │ ├── deduplication │ │ └── add_duplication_context.sql │ ├── json │ │ └── to_single_json.sql │ ├── agate │ │ └── row_value.sql │ ├── fivetran_utils │ │ ├── json_extract.sql │ │ └── percentile.sql │ ├── monitored_config.sql │ ├── mock │ │ └── empty_tables.sql │ ├── used_types.sql │ ├── column_types.sql │ ├── generate_alert_message.sql │ └── time_macros.sql ├── meta │ ├── monitored_model_queries.sql │ ├── save_monitored.sql │ ├── information_schema.sql │ ├── table_name.sql │ └── get_monitored.sql ├── metrics │ └── base │ │ ├── build_in │ │ ├── optional_table_metrics.sql │ │ ├── table_default.sql │ │ ├── column_default.sql │ │ └── optional_column_metrics.sql │ │ ├── internal_model_template.sql │ │ ├── queries.sql │ │ └── expression.sql ├── post_hook │ └── re_data_monitored.sql ├── store │ └── insert_list_to_table.sql ├── config │ └── get_model_config.sql ├── samples │ └── internal_model_template.sql ├── run_end │ └── save_results_history.sql └── tests │ └── test_metrics.sql ├── snapshots └── .gitkeep ├── tests └── .gitkeep ├── integration_tests ├── macros │ ├── .gitkeep │ ├── trigger_schema_change.sql │ ├── test_utils.sql │ ├── my_metrics.sql │ ├── create_test_source_tables.sql │ └── drop_all_schemas.sql ├── seeds │ ├── .gitkeep │ ├── public_macros │ │ ├── validating │ │ │ ├── validate_emails.csv │ │ │ ├── expected_validated_emails.csv │ │ │ ├── validate_numbers.csv │ │ │ ├── validate_date_and_time.csv │ │ │ ├── validate_uuid.csv │ │ │ ├── expected_validated_uuids.csv │ │ │ ├── expected_validated_credit_cards.csv │ │ │ ├── expected_validated_numbers.csv │ │ │ ├── expected_validated_date_and_time.csv │ │ │ ├── validate_ip.csv │ │ │ └── expected_validated_ips.csv │ │ ├── filtering │ │ │ ├── duplicated.csv │ │ │ ├── expected_duplicates.csv │ │ │ └── expected_deduplicated.csv │ │ ├── normalizing │ │ │ ├── abbreviated_us_states.csv │ │ │ ├── us_states_normalization.csv │ │ │ └── expected_us_states_normalized.csv │ │ └── cleaning │ │ │ ├── expected_sample_user_data.csv │ │ │ └── sample_user_data.csv │ └── monitoring │ │ ├── expected_table_samples.csv │ │ ├── sample_table.csv │ │ ├── sample_with_anomaly.csv │ │ ├── sample_without_time_filter.csv │ │ ├── expected_test_history.csv │ │ ├── expected_anomalies.csv │ │ └── 
expected_z_score.csv ├── tests │ └── .gitkeep ├── analysis │ └── .gitkeep ├── snapshots │ └── .gitkeep ├── python_tests │ ├── __init__.py │ ├── utils │ │ ├── __init__.py │ │ └── run.py │ ├── run_all_dbs.sh │ ├── conftest.py │ ├── test_cleaners.py │ ├── test_normalizers.py │ ├── test_validate.py │ ├── test_filters.py │ └── test_monitoring.py ├── requirements.txt ├── README.md ├── .gitignore ├── packages.yml ├── models │ ├── public_macros │ │ ├── filtering │ │ │ ├── duplicates.sql │ │ │ ├── schema.yml │ │ │ └── deduplicated.sql │ │ ├── cleaning │ │ │ ├── schema.yml │ │ │ └── sanitized_user_data.sql │ │ ├── normalizing │ │ │ ├── schema.yml │ │ │ └── us_states_normalized.sql │ │ └── validating │ │ │ ├── validated_emails.sql │ │ │ ├── validated_uuids.sql │ │ │ ├── validated_ips.sql │ │ │ ├── schema.yml │ │ │ ├── validated_numbers.sql │ │ │ └── validated_date_and_time.sql │ ├── monitoring │ │ ├── test_re_data_anomalies.sql │ │ ├── test_re_data_metrics.sql │ │ ├── test_re_data_test_history.sql │ │ ├── test_re_data_table_samples.sql │ │ ├── test_re_data_z_score.sql │ │ └── schema.yml │ ├── transformed │ │ ├── buy_events.sql │ │ └── schema.yml │ ├── sources │ │ └── schema.yml │ └── metrics │ │ └── re_data_metrics.yml ├── pytest.ini └── dbt_project.yml ├── models ├── internal │ ├── samples │ │ ├── re_data_last_table_samples.sql │ │ └── re_data_last_table_samples_part.sql │ ├── metrics │ │ └── base │ │ │ ├── re_data_last_base_metrics_part0.sql │ │ │ ├── re_data_last_base_metrics_part1.sql │ │ │ ├── re_data_last_base_metrics_part2.sql │ │ │ ├── re_data_last_base_metrics_part3.sql │ │ │ ├── re_data_last_base_metrics_thread0.sql │ │ │ ├── re_data_last_base_metrics_thread1.sql │ │ │ ├── re_data_last_base_metrics_thread2.sql │ │ │ └── re_data_last_base_metrics_thread3.sql │ └── re_data_run_started_at.sql ├── metrics │ ├── final │ │ └── re_data_metrics.sql │ ├── for_anomalies │ │ ├── re_data_last_metrics.sql │ │ └── re_data_last_stats.sql │ └── types │ │ ├── samples │ │ └── re_data_table_samples.sql │ │ ├── schema │ │ └── re_data_columns_over_time.sql │ │ └── base │ │ └── re_data_base_metrics.sql ├── meta │ ├── re_data_selected.sql │ ├── re_data_monitored.sql │ └── re_data_columns.sql ├── alerts │ ├── re_data_test_runs.sql │ ├── re_data_alerts.sql │ ├── re_data_z_score.sql │ ├── re_data_anomalies.sql │ └── re_data_schema_changes.sql └── logs │ └── re_data_test_history.sql ├── static └── lineage_graph.png ├── packages.yml ├── .gitignore ├── .github ├── pull_request_template.md ├── ISSUE_TEMPLATE │ ├── documentation-request.md │ ├── bug_report.md │ └── feature_request.md └── workflows │ ├── generate-docs.yml │ └── run-db-tests.yml ├── Makefile ├── README.md ├── LICENSE ├── dbt_project.yml └── profiles.yml /data/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /analysis/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /tests/.gitkeep: 
-------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /integration_tests/macros/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /integration_tests/seeds/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /integration_tests/tests/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /integration_tests/analysis/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /integration_tests/snapshots/.gitkeep: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /integration_tests/python_tests/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /integration_tests/python_tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /data/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/re-data/dbt-re-data/HEAD/data/.DS_Store -------------------------------------------------------------------------------- /models/internal/samples/re_data_last_table_samples.sql: -------------------------------------------------------------------------------- 1 | {{ re_data_last_table_samples() }} -------------------------------------------------------------------------------- /models/metrics/final/re_data_metrics.sql: -------------------------------------------------------------------------------- 1 | select * from {{ ref('re_data_base_metrics') }} -------------------------------------------------------------------------------- /integration_tests/requirements.txt: -------------------------------------------------------------------------------- 1 | protobuf==4.25.3 2 | pytest==6.2.5 3 | pyyaml==6.0 4 | -------------------------------------------------------------------------------- /integration_tests/README.md: -------------------------------------------------------------------------------- 1 | 2 | dbt project for running dbt_re_data integration tests 3 | 4 | -------------------------------------------------------------------------------- /macros/db/postgres/escape.sql: -------------------------------------------------------------------------------- 1 | {% macro postgres__escape_seq_for_json(chr) %}'\{{chr}}'{% endmacro %} -------------------------------------------------------------------------------- /macros/db/postgres/quote_string.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | {%- macro postgres__quote_new_line() %}'\\n'{% endmacro %} -------------------------------------------------------------------------------- /models/internal/metrics/base/re_data_last_base_metrics_part0.sql: 
-------------------------------------------------------------------------------- 1 | {{ re_data_last_base_metrics_part() }} -------------------------------------------------------------------------------- /models/internal/metrics/base/re_data_last_base_metrics_part1.sql: -------------------------------------------------------------------------------- 1 | {{ re_data_last_base_metrics_part() }} -------------------------------------------------------------------------------- /models/internal/metrics/base/re_data_last_base_metrics_part2.sql: -------------------------------------------------------------------------------- 1 | {{ re_data_last_base_metrics_part() }} -------------------------------------------------------------------------------- /models/internal/metrics/base/re_data_last_base_metrics_part3.sql: -------------------------------------------------------------------------------- 1 | {{ re_data_last_base_metrics_part() }} -------------------------------------------------------------------------------- /static/lineage_graph.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/re-data/dbt-re-data/HEAD/static/lineage_graph.png -------------------------------------------------------------------------------- /macros/db/redshift/escape.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro redshift__escape_seq_for_json(chr) %}'\\\{{chr}}'{% endmacro %} -------------------------------------------------------------------------------- /macros/db/redshift/quote_string.sql: -------------------------------------------------------------------------------- 1 | 2 | {%- macro redshift__quote_new_line() %}'\134\134n'{% endmacro %} -------------------------------------------------------------------------------- /models/internal/metrics/base/re_data_last_base_metrics_thread0.sql: -------------------------------------------------------------------------------- 1 | {{ re_data_last_base_metrics_thread(0)}} -------------------------------------------------------------------------------- /models/internal/metrics/base/re_data_last_base_metrics_thread1.sql: -------------------------------------------------------------------------------- 1 | {{ re_data_last_base_metrics_thread(1)}} -------------------------------------------------------------------------------- /models/internal/metrics/base/re_data_last_base_metrics_thread2.sql: -------------------------------------------------------------------------------- 1 | {{ re_data_last_base_metrics_thread(2)}} -------------------------------------------------------------------------------- /models/internal/metrics/base/re_data_last_base_metrics_thread3.sql: -------------------------------------------------------------------------------- 1 | {{ re_data_last_base_metrics_thread(3)}} -------------------------------------------------------------------------------- /packages.yml: -------------------------------------------------------------------------------- 1 | 2 | packages: 3 | - package: dbt-labs/dbt_utils 4 | version: [">=1.0.0", "<1.2.0"] 5 | -------------------------------------------------------------------------------- /integration_tests/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_modules/ 4 | dbt_packages/ 5 | logs/ 6 | .user.yml 7 | package-lock.yml 8 | -------------------------------------------------------------------------------- /macros/db/bigquery/quote_string.sql: 
-------------------------------------------------------------------------------- 1 | 2 | {%- macro bigquery__quote_string(str) %} 3 | r"""{{ str }}""" 4 | {% endmacro %} -------------------------------------------------------------------------------- /macros/db/postgres/postgres_type_db.sql: -------------------------------------------------------------------------------- 1 | {% macro postgres_type_db() %} 2 | {{ ('postgres', 'greenplum') }} 3 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - local: ../ 3 | 4 | - package: dbt-labs/dbt_utils 5 | version: [">=1.0.0", "<1.2.0"] 6 | -------------------------------------------------------------------------------- /macros/public/cleaning/clean_capitalize_words.sql: -------------------------------------------------------------------------------- 1 | {% macro clean_capitalize_words(column_name) %} 2 | initcap( {{column_name}} ) 3 | {% endmacro %} -------------------------------------------------------------------------------- /macros/utils/for_loops.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro print_list(l) %} 3 | {% for el in l %}{{el}}{% if not loop.last %},{% endif %}{% endfor %} 4 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/models/public_macros/filtering/duplicates.sql: -------------------------------------------------------------------------------- 1 | {{ re_data.filter_get_duplicates( 2 | ref('duplicated'), ['transaction_id'], ['creation_time']) }} 3 | -------------------------------------------------------------------------------- /integration_tests/python_tests/run_all_dbs.sh: -------------------------------------------------------------------------------- 1 | pytest --db postgres $@ & 2 | pytest --db snowflake $@ & 3 | pytest --db bigquery $@ & 4 | pytest --db redshift $@ & 5 | wait 6 | -------------------------------------------------------------------------------- /models/internal/samples/re_data_last_table_samples_part.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table', 4 | ) 5 | }} 6 | 7 | {{ re_data.empty_last_table_samples() }} -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | *.egg-info 3 | .DS_Store 4 | .vscode 5 | .python-version 6 | venv 7 | .idea/ 8 | logs/* 9 | dbt_modules/* 10 | dbt_packages/* 11 | target/* 12 | .env 13 | -------------------------------------------------------------------------------- /models/internal/re_data_run_started_at.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table', 4 | ) 5 | }} 6 | 7 | select {{ run_started_at.timestamp() * 1000000 }} as run_started_at -------------------------------------------------------------------------------- /.github/pull_request_template.md: -------------------------------------------------------------------------------- 1 | ## What 2 | *Describe what the change is solving* 3 | *It helps to add screenshots if it affects the frontend.* 4 | 5 | ## How 6 | *Describe the solution* 7 | -------------------------------------------------------------------------------- 
/integration_tests/seeds/public_macros/validating/validate_emails.csv: -------------------------------------------------------------------------------- 1 | user_id,email 2 | 1,test@fakemail.com 3 | 2,novalidemail@ 4 | 3,novalidemail@com 5 | 4,test+alovalidemail@fakemail.com 6 | -------------------------------------------------------------------------------- /macros/utils/depends_macro.sql: -------------------------------------------------------------------------------- 1 | {% macro generate_depends(used_tables) %} 2 | {% for t in used_tables %} 3 | -- depends_on: {{ ref(t) }} 4 | {% endfor %} 5 | 6 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/seeds/monitoring/expected_table_samples.csv: -------------------------------------------------------------------------------- 1 | table_name,sample_data_length 2 | BUY_EVENTS,506 3 | RE_DATA_SOURCE_TEST_TABLE,361 4 | SAMPLE_TABLE,830 5 | SAMPLE_WITH_ANOMALY,507 6 | -------------------------------------------------------------------------------- /models/meta/re_data_selected.sql: -------------------------------------------------------------------------------- 1 | 2 | select 3 | name, schema, database, time_filter, metrics, columns, anomaly_detector, owners 4 | from {{ ref('re_data_monitored')}} 5 | where 6 | selected = true -------------------------------------------------------------------------------- /macros/db/bigquery/quote_column_name.sql: -------------------------------------------------------------------------------- 1 | {% macro bigquery__quote_column_name(column_name) %} 2 | {% set quoted_col_name = '`' + column_name + '`' %} 3 | {{ return(quoted_col_name) }} 4 | {% endmacro %} -------------------------------------------------------------------------------- /macros/utils/comma_delimited_list.sql: -------------------------------------------------------------------------------- 1 | {% macro comma_delimited_list(args) %} 2 | {%- for arg in args %} 3 | {{- arg -}} {{- ", " if not loop.last else "" -}} 4 | {% endfor %} 5 | {% endmacro %} 6 | -------------------------------------------------------------------------------- /macros/utils/comparison_text.sql: -------------------------------------------------------------------------------- 1 | {% macro comparison_text(a, b) %} 2 | case when {{a}} > {{b}} then 'greater than' 3 | when {{a}} = {{b}} then 'equal to' 4 | else 'less than' end 5 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/pytest.ini: -------------------------------------------------------------------------------- 1 | [pytest] 2 | norecursedirs = dbt_modules models target logs data analysis macros snapshots tests 3 | python_files = test_*.py 4 | python_functions = test* 5 | addopts = --capture=no --durations=0 -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/validating/expected_validated_emails.csv: -------------------------------------------------------------------------------- 1 | user_id,email,email_valid 2 | 1,test@fakemail.com,1 3 | 2,novalidemail@,0 4 | 3,novalidemail@com,0 5 | 4,test+alovalidemail@fakemail.com,0 6 | -------------------------------------------------------------------------------- /macros/db/core/escape.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro escape_seq_for_json(chr) %}{{adapter.dispatch('escape_seq_for_json', 're_data')(chr)}}{% endmacro %} 3 | 4 
| {% macro default__escape_seq_for_json(chr) %}'\\\{{chr}}'{% endmacro %} -------------------------------------------------------------------------------- /integration_tests/models/public_macros/cleaning/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: sanitized_user_data 5 | tests: 6 | - dbt_utils.equality: 7 | compare_model: ref('expected_sample_user_data') -------------------------------------------------------------------------------- /integration_tests/models/public_macros/normalizing/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: us_states_normalized 5 | tests: 6 | - dbt_utils.equality: 7 | compare_model: ref('expected_us_states_normalized') -------------------------------------------------------------------------------- /macros/db/snowflake/identifier_mapping.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro snowflake__name_in_db(name) %} 3 | {% if name %} 4 | {{ return (name.upper()) }} 5 | {% else %} 6 | {{ return (name) }} 7 | {% endif %} 8 | {% endmacro %} -------------------------------------------------------------------------------- /macros/utils/bool_to_string.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro bool_to_string(column) %} 3 | ( 4 | case when {{ column }} = true then 'true' 5 | when {{ column }} = false then 'false' 6 | end 7 | ) as {{ column }} 8 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/validating/validate_numbers.csv: -------------------------------------------------------------------------------- 1 | number 2 | "133" 3 | "1232.232" 4 | "2332,123" 5 | "not a number" 6 | "1,3%" 7 | "123%" 8 | "13 %" 9 | "76.234%" 10 | "not a number" 11 | "x" 12 | "123partly987" 13 | -------------------------------------------------------------------------------- /macros/utils/is_list.sql: -------------------------------------------------------------------------------- 1 | {% macro is_list(obj) %} 2 | {% if not obj %} 3 | {{ return (False) }} 4 | {% endif %} 5 | {% set check = obj is iterable and (obj is not string and obj is not mapping) %} 6 | {{ return (check) }} 7 | {% endmacro %} -------------------------------------------------------------------------------- /macros/db/bigquery/split_and_return_nth_value.sql: -------------------------------------------------------------------------------- 1 | {% macro bigquery__split_and_return_nth_value(column_name, delimiter, ordinal) %} 2 | split({{ re_data.clean_blacklist(column_name, ['"', '`'], '') }}, '{{ delimiter }}')[ORDINAL( {{ ordinal }} )] 3 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/validating/validate_date_and_time.csv: -------------------------------------------------------------------------------- 1 | date_time 2 | 31-01-2020 3 | 01/31/2020 4 | 05.05.2020 5 | 2020-01-31 6 | 23:59 7 | 12:59 8 | 13:59:01 9 | "12:59:01,55" 10 | 11:59:00 11 | midnight 12 | 2020-01-31T12:59:00+02:00 13 | 2020-01-31T12:59:00 -------------------------------------------------------------------------------- /integration_tests/models/public_macros/validating/validated_emails.sql: -------------------------------------------------------------------------------- 1 | with 2 | all_emails as ( 
3 | select * from {{ ref('validate_emails') }} 4 | ) 5 | 6 | select *, case when {{ re_data.valid_email('email') }} then 1 else 0 end as email_valid 7 | from all_emails 8 | -------------------------------------------------------------------------------- /integration_tests/models/public_macros/validating/validated_uuids.sql: -------------------------------------------------------------------------------- 1 | with 2 | all_rows as ( 3 | select * from {{ ref('validate_uuid') }} 4 | ) 5 | 6 | select *, 7 | case when {{ re_data.valid_uuid('uuid') }} then 1 else 0 end as valid_uuid 8 | from all_rows 9 | -------------------------------------------------------------------------------- /integration_tests/models/monitoring/test_re_data_anomalies.sql: -------------------------------------------------------------------------------- 1 | 2 | select 3 | {{ clean_table_name('table_name') }} as table_name, 4 | {{ clean_column_name('column_name') }} as column_name, 5 | metric, 6 | anomaly_detector, 7 | interval_length_sec 8 | 9 | from {{ ref('re_data_anomalies') }} -------------------------------------------------------------------------------- /macros/db/core/identifier_mapping.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro name_in_db(name) %} 3 | {% set translated = adapter.dispatch('name_in_db', 're_data')(name) %} 4 | {{ return(translated) }} 5 | 6 | {% endmacro %} 7 | 8 | {% macro default__name_in_db(name) %} 9 | {{ return(name) }} 10 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/models/public_macros/cleaning/sanitized_user_data.sql: -------------------------------------------------------------------------------- 1 | select 2 | {{ re_data.clean_capitalize_words(re_data.clean_additional_whitespaces('full_name')) }} as full_name, 3 | {{ re_data.clean_blacklist('email', ['^[a-zA-Z0-9_.+-]+'], '*****') }} as email 4 | from {{ ref('sample_user_data') }} 5 | -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/validating/validate_uuid.csv: -------------------------------------------------------------------------------- 1 | uuid 2 | ace1245c-3af5-11ec-8d3d-0242ac130003 3 | a568464e-a05d-412c-8b30-517a46c57d88 4 | notanuid 5 | d0d61836-3af5-11ec-8d3d-0242ac130003 6 | d0d61c6e-3af5-11ec-8d3d-0242ac130003 7 | 343422-234324-234234-4234234-23432 8 | 343422-234324-234234-4234234-234xxx32 -------------------------------------------------------------------------------- /macros/utils/formulas.sql: -------------------------------------------------------------------------------- 1 | {% macro percentage_formula(summation, total) %} 2 | abs( 3 | ( 4 | cast({{ summation }} as {{ numeric_type() }}) 5 | ) / 6 | nullif( 7 | cast( {{ total }} as {{ numeric_type() }} ) 8 | , 0) * 100.0 9 | ) 10 | {% endmacro %} -------------------------------------------------------------------------------- /models/metrics/for_anomalies/re_data_last_metrics.sql: -------------------------------------------------------------------------------- 1 | select 2 | table_name, 3 | column_name, 4 | metric, 5 | value as last_value, 6 | interval_length_sec, 7 | computed_on 8 | from 9 | {{ ref('re_data_base_metrics') }} 10 | where 11 | time_window_end = {{- time_window_end() -}} 12 | 13 | -------------------------------------------------------------------------------- /macros/meta/monitored_model_queries.sql: 
-------------------------------------------------------------------------------- 1 | {% macro get_tables() %} 2 | select * 3 | from {{ ref('re_data_selected') }} 4 | order by name, schema, database, time_filter 5 | {% endmacro %} 6 | 7 | {% macro get_schemas() %} 8 | select distinct schema, database 9 | from {{ ref('re_data_selected') }} 10 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/models/public_macros/filtering/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: deduplicated 5 | tests: 6 | - dbt_utils.equality: 7 | compare_model: ref('expected_deduplicated') 8 | - name: duplicates 9 | tests: 10 | - dbt_utils.equality: 11 | compare_model: ref('expected_duplicates') -------------------------------------------------------------------------------- /macros/metrics/base/build_in/optional_table_metrics.sql: -------------------------------------------------------------------------------- 1 | {% macro re_data_metric_distinct_table_rows(context) %} 2 | with temp_table AS ( 3 | select distinct * from {{ context.table_name }} 4 | where {{ in_time_window(context.time_filter) }} 5 | ) 6 | select coalesce(count(*), 0) FROM temp_table 7 | {% endmacro %} 8 | -------------------------------------------------------------------------------- /macros/utils/quote.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% macro quote_column(col_name) %} 4 | {{ adapter.dispatch('quote_column', 're_data')(col_name) }} 5 | {% endmacro %} 6 | 7 | {% macro default__quote_column(col_name) %} 8 | "{{ col_name }}" 9 | {% endmacro %} 10 | 11 | {% macro bigquery__quote_column(col_name) %} 12 | `{{ col_name }}` 13 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/validating/expected_validated_uuids.csv: -------------------------------------------------------------------------------- 1 | uuid,valid_uuid 2 | ace1245c-3af5-11ec-8d3d-0242ac130003,1 3 | a568464e-a05d-412c-8b30-517a46c57d88,1 4 | notanuid,0 5 | d0d61836-3af5-11ec-8d3d-0242ac130003,1 6 | d0d61c6e-3af5-11ec-8d3d-0242ac130003,1 7 | 343422-234324-234234-4234234-23432,0 8 | 343422-234324-234234-4234234-234xxx32,0 9 | -------------------------------------------------------------------------------- /models/alerts/re_data_test_runs.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='view' 4 | ) 5 | }} 6 | 7 | select 8 | sum(case when status = 'Fail' then 1 else 0 end) as failed, 9 | sum(case when status = 'Pass' then 1 else 0 end) as passed, 10 | run_at 11 | from {{ ref ('re_data_test_history') }} 12 | group by run_at 13 | order by run_at desc -------------------------------------------------------------------------------- /integration_tests/models/monitoring/test_re_data_metrics.sql: -------------------------------------------------------------------------------- 1 | 2 | select 3 | {{ clean_table_name('table_name') }} as table_name, 4 | {{ clean_column_name('column_name') }} as column_name, 5 | metric, 6 | time_window_start, 7 | time_window_end, 8 | {{ to_big_integer('value') }}, 9 | interval_length_sec 10 | 11 | from {{ ref('re_data_metrics') }} -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/validating/expected_validated_credit_cards.csv: 
-------------------------------------------------------------------------------- 1 | credit_card_number,valid_credit_card 2 | 4941533405630082,1 3 | 5476749195896614,1 4 | 3568497486294461,1 5 | not_a_card_number,0 6 | 3434-4351-4234-3234,0 7 | 3434 4351 4234 3234,0 8 | 34344 42344 43455 43456,0 9 | 43423432,0 10 | 234343443434,0 11 | 2343423423423423423423423423,0 12 | -------------------------------------------------------------------------------- /macros/utils/dict_from_list.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro dict_from_list(el_list) %} 3 | 4 | {% if el_list is none %} 5 | {{ return (none) }} 6 | {% endif %} 7 | 8 | {% set for_cols_dict = {} %} 9 | {% for col in el_list %} 10 | {% do for_cols_dict.update({col: True})%} 11 | {% endfor %} 12 | {% do return(for_cols_dict) %} 13 | 14 | {% endmacro %} -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/documentation-request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Documentation request 3 | about: Request new or updated documentation 4 | title: "[DOCUMENTATION]" 5 | labels: documentation 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Tell us about the documentation you'd like us to add or update** 11 | 12 | **Is anything unclear or outdated in the current version of the docs?** 13 | -------------------------------------------------------------------------------- /integration_tests/models/transformed/buy_events.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | re_data_monitored=true, 4 | re_data_time_filter='creation_time', 5 | re_data_anomaly_detector={'name': 'z_score', 'threshold': 0.5}, 6 | materialized='table', 7 | tags=['testtag'] 8 | ) 9 | }} 10 | select * 11 | from {{ ref('sample_with_anomaly') }} 12 | where event_type = 'buy' -------------------------------------------------------------------------------- /macros/db/core/quote_column_name.sql: -------------------------------------------------------------------------------- 1 | {% macro quote_column_name(column_name) %} 2 | {% set col_name = adapter.dispatch('quote_column_name', 're_data')(column_name) %} 3 | {{ return(col_name) }} 4 | {% endmacro %} 5 | 6 | 7 | {% macro default__quote_column_name(column_name) %} 8 | {% set quoted_col_name = '"' + column_name + '"' %} 9 | {{ return(quoted_col_name) }} 10 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/filtering/duplicated.csv: -------------------------------------------------------------------------------- 1 | transaction_id,creation_time,status,value 2 | 1,2021-05-01 12:31:32,pending,100 3 | 2,2021-05-01 12:35:35,pending,200 4 | 1,2021-05-01 12:40:35,completed,100 5 | 3,2021-05-01 12:40:35,pending,300 6 | 3,2021-05-02 12:31:32,completed,300 7 | 4,2021-05-02 12:35:35,completed,10 8 | 5,2021-05-02 12:40:35,pending,100 9 | 4,2021-05-02 12:40:35,completed,40 10 | -------------------------------------------------------------------------------- /macros/db/core/quote_string.sql: -------------------------------------------------------------------------------- 1 | 2 | {%- macro quote_string(str) %} 3 | {{ adapter.dispatch('quote_string', 're_data')(str) }} 4 | {% endmacro %} 5 | 6 | {%- macro default__quote_string(str) %} 7 | $${{ str }}$$ 8 | {% endmacro %} 9 | 10 | 11 | {%- macro quote_new_line() %}{{ 
adapter.dispatch('quote_new_line', 're_data')() }}{% endmacro %} 12 | 13 | {%- macro default__quote_new_line() %}'\134\134n'{% endmacro %} -------------------------------------------------------------------------------- /integration_tests/models/monitoring/test_re_data_test_history.sql: -------------------------------------------------------------------------------- 1 | 2 | select 3 | {{ clean_table_name('table_name') }} as table_name, 4 | {{ clean_column_name('column_name') }} as column_name, 5 | right(test_name, 15) as test_name, 6 | status, 7 | {{ clean_column_name('message') }} as message, 8 | cast (failures_count as integer) as failures_count, 9 | severity 10 | from {{ ref('re_data_test_history') }} -------------------------------------------------------------------------------- /integration_tests/python_tests/conftest.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | def pytest_addoption(parser): 4 | parser.addoption("--db", action="store") 5 | parser.addoption("--source_schema", action="store") 6 | 7 | 8 | @pytest.fixture() 9 | def db(pytestconfig): 10 | return pytestconfig.getoption("db") 11 | 12 | @pytest.fixture() 13 | def source_schema(pytestconfig): 14 | return pytestconfig.getoption("source_schema") -------------------------------------------------------------------------------- /macros/db/core/split_and_return_nth_value.sql: -------------------------------------------------------------------------------- 1 | {% macro split_and_return_nth_value(column_name, delimiter, ordinal) -%} 2 | {{ adapter.dispatch('split_and_return_nth_value', 're_data')(column_name, delimiter, ordinal) }} 3 | {%- endmacro %} 4 | 5 | {% macro default__split_and_return_nth_value(column_name, delimiter, ordinal) -%} 6 | split_part({{ re_data.clean_blacklist(column_name, ['"', '`'], '') }}, '{{ delimiter }}', {{ ordinal }}) 7 | {%- endmacro %} -------------------------------------------------------------------------------- /integration_tests/models/monitoring/test_re_data_table_samples.sql: -------------------------------------------------------------------------------- 1 | 2 | select 3 | {{ clean_table_name('table_name') }} as table_name, 4 | length(sample_data) as sample_data_length 5 | from {{ ref('re_data_table_samples') }} 6 | where {{ clean_table_name('table_name') }} != 'SAMPLE_WITHOUT_TIME_FILTER' 7 | 8 | -- SAMPLE_WITHOUT_TIME_FILTER is excluded because this table doesn't have a time filter, so it's not possible 9 | -- to say exactly what its sample should look like.
-------------------------------------------------------------------------------- /macros/post_hook/re_data_monitored.sql: -------------------------------------------------------------------------------- 1 | {% macro pub_insert_into_re_data_monitored() %} 2 | {% set monitored = re_data.pub_monitored_from_graph() %} 3 | {% do re_data.insert_list_to_table( 4 | this, 5 | monitored, 6 | ['name', 'schema', 'database', 'time_filter', 'metrics_groups', 'additional_metrics', 'metrics', 'columns', 'anomaly_detector', 'owners', 'selected'] 7 | ) %} 8 | 9 | {{ return('') }} 10 | 11 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/models/public_macros/validating/validated_ips.sql: -------------------------------------------------------------------------------- 1 | with 2 | all_rows as ( 3 | select * from {{ ref('validate_ip') }} 4 | ) 5 | 6 | select *, 7 | case when {{ re_data.valid_ip_v4('ip_address') }} then 1 else 0 end as valid_ip_v4, 8 | case when {{ re_data.valid_ip_v6('ip_address') }} then 1 else 0 end as valid_ip_v6, 9 | case when {{ re_data.valid_ip('ip_address') }} then 1 else 0 end as valid_ip 10 | from all_rows 11 | -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/filtering/expected_duplicates.csv: -------------------------------------------------------------------------------- 1 | "transaction_id","creation_time","status","value","re_data_duplicate_group_row_count","re_data_duplicate_group_row_number" 2 | 1,"2021-05-01 12:31:32","pending",100,2,1 3 | 1,"2021-05-01 12:40:35","completed",100,2,2 4 | 3,"2021-05-01 12:40:35","pending",300,2,1 5 | 3,"2021-05-02 12:31:32","completed",300,2,2 6 | 4,"2021-05-02 12:35:35","completed",10,2,1 7 | 4,"2021-05-02 12:40:35","completed",40,2,2 8 | -------------------------------------------------------------------------------- /integration_tests/models/sources/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: "{{ var('source_schema', target.schema) }}" 5 | tables: 6 | - name: re_data_source_test_table 7 | columns: 8 | - name: number 9 | tests: 10 | - not_null 11 | - unique 12 | 13 | - name: description 14 | tests: 15 | - not_null 16 | - unique 17 | 18 | -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/validating/expected_validated_numbers.csv: -------------------------------------------------------------------------------- 1 | number,is_number,is_number_decimal_point,is_number_decimal_comma,is_percentage,is_percentage_decimal_point,is_percentage_decimal_comma 2 | 133,1,0,0,0,0,0 3 | 1232.232,0,1,0,0,0,0 4 | "2332,123",0,0,1,0,0,0 5 | not a number,0,0,0,0,0,0 6 | "1,3%",0,0,0,1,0,1 7 | 123%,0,0,0,1,1,1 8 | 13 %,0,0,0,0,0,0 9 | 76.234%,0,0,0,1,1,0 10 | not a number,0,0,0,0,0,0 11 | x,0,0,0,0,0,0 12 | 123partly987,0,0,0,0,0,0 13 | -------------------------------------------------------------------------------- /macros/utils/get_database.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro get_target_database() %} 3 | {{- adapter.dispatch('get_target_database', 're_data')() -}} 4 | {% endmacro %} 5 | 6 | {% macro default__get_target_database() %} 7 | {{- return (target.dbname) -}} 8 | {% endmacro %} 9 | 10 | {% macro bigquery__get_target_database() %} 11 | {{- return (target.project) -}} 12 | {% endmacro %} 13 | 14 | 
{% macro snowflake__get_target_database() %} 15 | {{- return (target.database) -}} 16 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/seeds/monitoring/sample_table.csv: -------------------------------------------------------------------------------- 1 | creation_time,update_time,event_type,value1,value2,null_value,not_used_colum 2 | 2021-04-30 12:40:35,,buy,100,109,,1 3 | 2021-05-01 12:31:32,,buy,100,200,,1 4 | 2021-05-01 12:35:35,,buy,110,205,,1 5 | 2021-05-01 12:40:35,,sell,200,209,,1 6 | 2021-05-01 12:40:35,2021-05-01 12:40:37,buy,100,109,,1 7 | 2021-05-02 12:31:32,,buy,110,200,,1 8 | 2021-05-02 12:35:35,,buy,150,205,,1 9 | 2021-05-02 12:40:35,,buy,210,209,,1 10 | 2021-05-02 12:40:35,,buy,100,109,,1 11 | -------------------------------------------------------------------------------- /integration_tests/seeds/monitoring/sample_with_anomaly.csv: -------------------------------------------------------------------------------- 1 | creation_time,event_type,value1,value2 2 | 2021-04-30 12:40:35,buy,101,109 3 | 2021-05-01 12:31:32,buy,107,200 4 | 2021-05-02 12:35:35,buy,98,205 5 | 2021-05-03 12:40:35,sell,108,209 6 | 2021-05-04 12:40:35,buy,100,109 7 | 2021-05-05 12:31:32,buy,110,200 8 | 2021-05-06 12:35:35,buy,99,205 9 | 2021-05-07 12:40:35,buy,94,209 10 | 2021-05-08 12:40:35,buy,104,109 11 | 2021-05-09 12:31:32,buy,10,200 12 | 2021-05-10 12:35:35,buy,23,205 13 | 2021-05-11 12:40:35,sell,10,209 14 | -------------------------------------------------------------------------------- /macros/public/filtering/remove_duplicates.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | {% macro filter_remove_duplicates(relation, unique_cols, sort_columns) %} 5 | ( 6 | with with_row_num as ( 7 | {{re_data.add_duplication_context(relation, unique_cols, sort_columns)}} 8 | ), 9 | one_row_num as ( 10 | select * from with_row_num where re_data_duplicate_group_row_number = 1 11 | ) 12 | select {{ dbt_utils.star(from=relation) }} 13 | from one_row_num 14 | ) 15 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/macros/trigger_schema_change.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro schema_change_buy_events_add_column() %} 3 | {% set alter_table %} 4 | alter table {{ ref('buy_events')}} add column sample_column boolean 5 | {% endset %} 6 | {% do run_query(alter_table) %} 7 | {% endmacro %} 8 | 9 | 10 | {% macro schema_change_buy_events_drop_column() %} 11 | {% set alter_table %} 12 | alter table {{ ref('buy_events')}} drop column sample_column 13 | {% endset %} 14 | {% do run_query(alter_table) %} 15 | {% endmacro %} -------------------------------------------------------------------------------- /macros/utils/in_compile.sql: -------------------------------------------------------------------------------- 1 | {% macro in_compile() %} 2 | 3 | {%- call statement('in_compile', fetch_result=True) -%} 4 | select * from {{ ref('re_data_run_started_at') }} 5 | {%- endcall -%} 6 | 7 | {% if execute %} 8 | {%- set result = load_result('in_compile')['data'][0][0] -%} 9 | {% if result == run_started_at.timestamp() * 1000000 %} 10 | {{ return(False) }} 11 | {% else %} 12 | {{ return(True) }} 13 | {% endif %} 14 | {% endif %} 15 | 16 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/python_tests/test_cleaners.py: 
-------------------------------------------------------------------------------- 1 | from .utils.run import dbt_seed, dbt_run, dbt_test 2 | 3 | def test_cleaners(db, source_schema, debug=True): 4 | dbt_vars = { 5 | 'source_schema': source_schema 6 | } 7 | 8 | print (f"Running setup and tests for {db}") 9 | 10 | dbt_seed(f'--select public_macros.cleaning', db, dbt_vars) 11 | dbt_run(f'--select sanitized_user_data+', db, dbt_vars) 12 | dbt_test(f'--select sanitized_user_data', db, dbt_vars) 13 | 14 | print (f"Running tests completed for {db}") 15 | -------------------------------------------------------------------------------- /models/metrics/types/samples/re_data_table_samples.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table', 4 | unique_key = 'table_name', 5 | on_schema_change='sync_all_columns', 6 | ) 7 | }} 8 | 9 | -- depends_on: {{ ref('re_data_last_table_samples') }} 10 | -- depends_on: {{ ref('re_data_last_table_samples_part') }} 11 | 12 | select 13 | table_name, 14 | sample_data, 15 | cast ({{dbt.current_timestamp_backcompat()}} as {{ timestamp_type() }}) as sampled_on 16 | 17 | from {{ ref('re_data_last_table_samples_part') }} 18 | -------------------------------------------------------------------------------- /integration_tests/python_tests/test_normalizers.py: -------------------------------------------------------------------------------- 1 | from .utils.run import dbt_seed, dbt_run, dbt_test 2 | 3 | def test_normalizers(db, source_schema, debug=True): 4 | dbt_vars = { 5 | 'source_schema': source_schema 6 | } 7 | 8 | print (f"Running setup and tests for {db}") 9 | 10 | dbt_seed(f'--select public_macros.normalizing', db, dbt_vars) 11 | dbt_run(f'--select us_states_normalized+', db, dbt_vars) 12 | dbt_test(f'--models us_states_normalized', db, dbt_vars) 13 | 14 | print (f"Running tests completed for {db}") 15 | -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/validating/expected_validated_date_and_time.csv: -------------------------------------------------------------------------------- 1 | date_time,valid_date_eu,valid_date_us,valid_date_inverse,valid_date_iso_8601,valid_time_24h,valid_time_12h,valid_time 2 | 31-01-2020,1,0,0,0,0,0,0 3 | 01/31/2020,0,1,0,0,0,0,0 4 | 05.05.2020,1,1,0,0,0,0,0 5 | 2020-01-31,0,0,1,0,0,0,0 6 | 23:59,0,0,0,0,1,0,1 7 | 12:59,0,0,0,0,1,1,1 8 | 13:59:01,0,0,0,0,0,0,1 9 | "12:59:01,55",0,0,0,0,0,0,1 10 | 11:59:00,0,0,0,0,0,0,1 11 | midnight,0,0,0,0,0,0,0 12 | 2020-01-31T12:59:00+02:00,0,0,0,1,0,0,0 13 | 2020-01-31T12:59:00,0,0,0,1,0,0,0 14 | -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/bug_report.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Bug report 3 | about: Create a report to help us improve 4 | title: "[BUG]" 5 | labels: bug 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Describe the bug** 11 | Description of what the bug is. 12 | 13 | **Expected behavior** 14 | Description of what you expected to happen. 15 | 16 | **To Reproduce** 17 | Steps to reproduce the behavior: 18 | 19 | **Screenshots** 20 | If applicable, add screenshots to help explain your problem. 
21 | 22 | **Logs and additional context** 23 | If applicable, add any other context, logs, etc. here. 24 | -------------------------------------------------------------------------------- /integration_tests/python_tests/test_validate.py: -------------------------------------------------------------------------------- 1 | from .utils.run import dbt_seed, dbt_run, dbt_test 2 | 3 | def test_validate_regex(db, source_schema, debug=True): 4 | dbt_vars = { 5 | 'source_schema': source_schema 6 | } 7 | 8 | print (f"Running setup and tests for {db}") 9 | 10 | dbt_seed( 11 | f'--select public_macros.validating', db, dbt_vars 12 | ) 13 | 14 | dbt_run(f'--select public_macros.validating', db, dbt_vars) 15 | dbt_test(f'--select public_macros.validating', db, dbt_vars) 16 | 17 | print (f"Running tests completed for {db}") -------------------------------------------------------------------------------- /integration_tests/seeds/monitoring/sample_without_time_filter.csv: -------------------------------------------------------------------------------- 1 | title,rental_rate,rating 2 | Academy Dinosaur,0.99,PG-13 3 | Alamo Videotape,0.99,G 4 | Affair Prejudice,2.99,G 5 | African Egg,2.99,G 6 | Ace Goldfinger,4.99,G 7 | Alice Fantasia,0.99,NC-17 8 | Adaptation Holes,2.99,NC-17 9 | Alien Center,2.99,NC-17 10 | Aladdin Calendar,4.99,NC-17 11 | Chamber Italian,4.99,NC-17 12 | Alaska Phantom,0.99,PG 13 | Agent Truman,2.99,PG 14 | Ali Forever,4.99,PG 15 | Alabama Devil,2.99,PG-13 16 | Bright Encounters,4.99,PG-13 17 | Airplane Sierra,4.99,PG-13 18 | Date Speed,0.99,R 19 | Grosse Wonderful,4.99,R 20 | Airport Pollock,4.99,R -------------------------------------------------------------------------------- /macros/public/store/export_table_samples.sql: -------------------------------------------------------------------------------- 1 | {% macro export_table_samples(start_date, end_date, table_samples_path=None) %} 2 | {% set table_samples_query %} 3 | select 4 | lower(table_name) as table_name, 5 | sample_data, 6 | sampled_on 7 | from 8 | {{ ref('re_data_table_samples') }} 9 | {% endset %} 10 | 11 | {% set query_result = run_query(table_samples_query) %} 12 | {% set table_samples_file_path = table_samples_path or 'target/re_data/table_samples.json' %} 13 | {% do query_result.to_json(table_samples_file_path) %} 14 | 15 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/macros/test_utils.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro to_big_integer(field) %} 3 | cast (round({{field}} * 1000) as integer) as {{field}} 4 | {% endmacro %} 5 | 6 | {% macro clean_table_name(field) %} 7 | upper( 8 | {{- 9 | re_data.clean_blacklist( 10 | re_data.split_and_return_nth_value(field, '.', 3), 11 | ['"', '`'], 12 | '' 13 | ) 14 | -}} 15 | ) 16 | {% endmacro %} 17 | 18 | {% macro clean_column_name(field) %} 19 | case when ({{ field }} = '' or {{ field }} is null ) then '---' else upper({{field}}) end 20 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/models/public_macros/filtering/deduplicated.sql: -------------------------------------------------------------------------------- 1 | with x as 2 | {{ re_data.filter_remove_duplicates( 3 | ref('duplicated'), ['transaction_id'], ['creation_time']) }} 4 | 5 | select *, 'take_first' as use_case from x 6 | 7 | union all 8 | 9 | select *, 'take_last' as use_case from {{ re_data.filter_remove_duplicates( 10 | 
ref('duplicated'), ['transaction_id'], ['creation_time desc']) }} duplicates 11 | 12 | 13 | union all 14 | 15 | select *, 'take_all_statuses' as use_case from {{ re_data.filter_remove_duplicates( 16 | ref('duplicated'), ['transaction_id', 'status'], ['creation_time desc']) }} duplicates -------------------------------------------------------------------------------- /macros/utils/regular_expression.sql: -------------------------------------------------------------------------------- 1 | {% macro regex_match_expression(column_name, pattern) %} 2 | {{ adapter.dispatch('regex_match_expression', 're_data')(column_name, pattern) }} 3 | {% endmacro %} 4 | 5 | {% macro default__regex_match_expression(column_name, pattern) %} 6 | ({{column_name}} ~ '{{pattern}}') 7 | {% endmacro %} 8 | 9 | {% macro bigquery__regex_match_expression(column_name, pattern) %} 10 | regexp_contains({{column_name}}, r'{{pattern}}') 11 | {% endmacro %} 12 | 13 | {% macro snowflake__regex_match_expression(column_name, pattern) %} 14 | regexp_like({{column_name | upper}}, '{{pattern}}') 15 | {% endmacro %} 16 | -------------------------------------------------------------------------------- /integration_tests/models/monitoring/test_re_data_z_score.sql: -------------------------------------------------------------------------------- 1 | 2 | {% set values_compare = [ 3 | 'z_score_value', 4 | 'modified_z_score_value', 5 | 'last_value', 6 | 'last_avg', 7 | 'last_stddev', 8 | 'last_median', 9 | 'last_iqr', 10 | 'last_median_absolute_deviation', 11 | 'last_mean_absolute_deviation', 12 | ] %} 13 | 14 | select 15 | {{ clean_table_name('table_name') }} as table_name, 16 | {{ clean_column_name('column_name') }} as column_name, 17 | metric, 18 | time_window_end, 19 | {% for col in values_compare %}{{ to_big_integer(col) }},{% endfor %} 20 | interval_length_sec 21 | 22 | from {{ ref('re_data_z_score') }} -------------------------------------------------------------------------------- /macros/utils/deduplication/add_duplication_context.sql: -------------------------------------------------------------------------------- 1 | {% macro add_duplication_context(relation, unique_cols, sort_columns) %} 2 | 3 | select {{ dbt_utils.star(from=relation) }} 4 | , count(*) over ( 5 | partition by {{ re_data.comma_delimited_list(unique_cols) }} 6 | ) as re_data_duplicate_group_row_count 7 | , row_number() over ( 8 | partition by {{ re_data.comma_delimited_list(unique_cols) }} {% if sort_columns %} order by {{ re_data.comma_delimited_list(sort_columns) }} {% endif %} 9 | ) as re_data_duplicate_group_row_number 10 | 11 | from {{ relation }} 12 | 13 | {% endmacro %} -------------------------------------------------------------------------------- /models/logs/re_data_test_history.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | on_schema_change='sync_all_columns', 5 | ) 6 | }} 7 | 8 | {{ 9 | re_data.empty_table_generic([ 10 | ('table_name', 'string'), 11 | ('column_name', 'string'), 12 | ('test_name', 'string'), 13 | ('status', 'string'), 14 | ('execution_time', 'numeric'), 15 | ('message', 'string'), 16 | ('failures_count', 'numeric'), 17 | ('failures_json', 'long_string'), 18 | ('failures_table', 'long_string'), 19 | ('severity', 'string'), 20 | ('compiled_sql', 'long_string'), 21 | ('run_at', 'timestamp') 22 | ]) 23 | }} -------------------------------------------------------------------------------- /models/meta/re_data_monitored.sql: 
-------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table', 4 | unique_key = 'table_name', 5 | post_hook="{% if execute %}{{ pub_insert_into_re_data_monitored() }}{% endif %}" 6 | ) 7 | }} 8 | 9 | {{ 10 | re_data.empty_table_generic([ 11 | ('name', 'string'), 12 | ('schema', 'string'), 13 | ('database', 'string'), 14 | ('time_filter', 'string'), 15 | ('metrics_groups', 'string'), 16 | ('additional_metrics', 'string'), 17 | ('metrics', 'string'), 18 | ('columns', 'string'), 19 | ('anomaly_detector', 'string'), 20 | ('owners', 'string'), 21 | ('selected', 'boolean') 22 | ]) 23 | }} -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/validating/validate_ip.csv: -------------------------------------------------------------------------------- 1 | ip_address 2 | 1.2.3.4 3 | 01.102.103.104 4 | 124.171.228.4 5 | 192.168.1.35 6 | 192.168.1.198 7 | 127.248.111.240 8 | 01.1.1 9 | 12325412 10 | notvalidatall 11 | 232.232.33 12 | 232.3232.232.232+2312 13 | ::::erwerwe 14 | ::3343:4343434343:34343:343434343:443 15 | 16 | 2001:db8:3333:4444:5555:6666:7777:8888 17 | 2001:db8:3333:4444:CCCC:DDDD:EEEE:FFFF 18 | 2001:db8:: 19 | ::1234:5678 20 | 2001:db8::1234:5678 21 | ::11.22.33.44 22 | 2001:db8::123.123.123.123 23 | 2001:db8::1234:5678:5.6.7.8 24 | 2001:db8:3333:4444:5555:6666:1.2.3.4 25 | ::11.22.33.44 26 | 2001:db8::123.123.123.123 27 | ::1234:5678:91.123.4.56 28 | ::1234:5678:1.2.3.4 29 | 2001:db8::1234:5678:5.6.7.8 -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/normalizing/abbreviated_us_states.csv: -------------------------------------------------------------------------------- 1 | state,code 2 | Ala.,AL 3 | Alaska,AK 4 | Ariz.,AZ 5 | Ark.,AR 6 | Calif.,CA 7 | Colo.,CO 8 | Conn.,CT 9 | Del.,DE 10 | D.C.,DC 11 | Fla.,FL 12 | Ga.,GA 13 | Hawaii,HI 14 | Idaho,ID 15 | Ill.,IL 16 | Ind.,IN 17 | Iowa,IA 18 | Kans.,KS 19 | Ky.,KY 20 | La.,LA 21 | Maine,ME 22 | Md.,MD 23 | Mass.,MA 24 | Mich.,MI 25 | Minn.,MN 26 | Miss.,MS 27 | Mo.,MO 28 | Mont.,MT 29 | Nebr.,NE 30 | Nev.,NV 31 | N.H.,NH 32 | N.J.,NJ 33 | N.M.,NM 34 | N.Y.,NY 35 | N.C.,NC 36 | N.D.,ND 37 | Ohio,OH 38 | Okla.,OK 39 | Ore.,OR 40 | Pa.,PA 41 | R.I.,RI 42 | S.C.,SC 43 | S.D.,SD 44 | Tenn.,TN 45 | Tex.,TX 46 | Utah,UT 47 | Vt.,VT 48 | Va.,VA 49 | Wash.,WA 50 | W.Va.,WV 51 | Wis.,WI 52 | Wyo.,WY 53 | -------------------------------------------------------------------------------- /integration_tests/models/monitoring/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: test_re_data_metrics 5 | tests: 6 | - dbt_utils.equality: 7 | compare_model: ref('expected_metrics') 8 | 9 | - name: test_re_data_z_score 10 | tests: 11 | - dbt_utils.equality: 12 | compare_model: ref('expected_z_score') 13 | 14 | - name: test_re_data_anomalies 15 | tests: 16 | - dbt_utils.equality: 17 | compare_model: ref('expected_anomalies') 18 | 19 | - name: test_re_data_test_history 20 | tests: 21 | - dbt_utils.equality: 22 | compare_model: ref('expected_test_history') 23 | 24 | - name: test_re_data_table_samples 25 | tests: 26 | - dbt_utils.equality: 27 | compare_model: ref('expected_table_samples') -------------------------------------------------------------------------------- /integration_tests/models/public_macros/validating/schema.yml: 
-------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: validated_emails 5 | tests: 6 | - dbt_utils.equality: 7 | compare_model: ref('expected_validated_emails') 8 | 9 | - name: validated_numbers 10 | tests: 11 | - dbt_utils.equality: 12 | compare_model: ref('expected_validated_numbers') 13 | 14 | - name: validated_date_and_time 15 | tests: 16 | - dbt_utils.equality: 17 | compare_model: ref('expected_validated_date_and_time') 18 | 19 | - name: validated_ips 20 | tests: 21 | - dbt_utils.equality: 22 | compare_model: ref('expected_validated_ips') 23 | 24 | - name: validated_uuids 25 | tests: 26 | - dbt_utils.equality: 27 | compare_model: ref('expected_validated_uuids') -------------------------------------------------------------------------------- /.github/ISSUE_TEMPLATE/feature_request.md: -------------------------------------------------------------------------------- 1 | --- 2 | name: Feature request 3 | about: Suggest an idea for this project 4 | title: "[FEATURE]" 5 | labels: '' 6 | assignees: '' 7 | 8 | --- 9 | 10 | **Tell us about the problem you're trying to solve** 11 | What are you trying to do, and why is it hard? A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] 12 | 13 | **Describe the solution you’d like** 14 | A clear and concise description of what you want to see happen, or the change you would like to see 15 | 16 | **Describe the alternative you’ve considered or used** 17 | A clear and concise description of any alternative solutions or features you've considered or are using today. 18 | 19 | **Additional context** 20 | Add any other context or screenshots about the feature request here. 21 | -------------------------------------------------------------------------------- /macros/utils/json/to_single_json.sql: -------------------------------------------------------------------------------- 1 | {% macro to_json_string_value_or_null(column) %} 2 | ( 3 | case 4 | when {{ column }} is null then 'null' 5 | else '"' || 6 | regexp_replace( 7 | replace(cast({{ column }} as {{ string_type() }}), '"', {{ escape_seq_for_json('"') }}), 8 | '\n', {{ quote_new_line() }} {% if target.type in postgres_type_db() %}, 'g' {% endif %} 9 | ) || '"' 10 | end 11 | ) 12 | {% endmacro %} 13 | 14 | {% macro to_single_json(columns) %} 15 | '{' || 16 | {%- for column in columns %} 17 | '"{{ column }}": ' || 18 | {{ to_json_string_value_or_null(column) }} 19 | {%- if not loop.last %} || ',' || {%- endif %} 20 | {%- endfor %} 21 | || '}' 22 | {% endmacro %} 23 | -------------------------------------------------------------------------------- /integration_tests/models/public_macros/validating/validated_numbers.sql: -------------------------------------------------------------------------------- 1 | with 2 | all_num_rows as ( 3 | select * from {{ ref('validate_numbers') }} 4 | ) 5 | 6 | select *, 7 | case when {{ re_data.valid_number('number') }} then 1 else 0 end as is_number, 8 | case when {{ re_data.valid_number_decimal_point('number') }} then 1 else 0 end as is_number_decimal_point, 9 | case when {{ re_data.valid_number_decimal_comma('number') }} then 1 else 0 end as is_number_decimal_comma, 10 | case when {{ re_data.valid_number_percentage('number') }} then 1 else 0 end as is_percentage, 11 | case when {{ re_data.valid_number_percentage_point('number') }} then 1 else 0 end as is_percentage_decimal_point, 12 | case when {{ re_data.valid_number_percentage_comma('number') }} then 1 else 0 end 
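-- Added note (not part of the original model): each re_data.valid_* helper used above compiles,
-- via re_data.regex_match_expression, into a warehouse-specific regex predicate
-- (e.g. roughly (number ~ 'pattern') on Postgres, regexp_contains(number, r'pattern') on BigQuery);
-- the concrete regex patterns are kept in macros/public/validating/regex_dict.sql (per the package layout).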
as is_percentage_decimal_comma 13 | from all_num_rows 14 | -------------------------------------------------------------------------------- /macros/meta/save_monitored.sql: -------------------------------------------------------------------------------- 1 | {% macro save_monitored(monitored_path) %} 2 | 3 | {% set monitored_query %} 4 | select 5 | {{ full_table_name('name', 'schema', 'database') }} as {{ re_data.quote_column('model') }}, 6 | time_filter as {{ re_data.quote_column('time_filter') }}, 7 | metrics as {{ re_data.quote_column('metrics') }}, 8 | columns as {{ re_data.quote_column('columns') }}, 9 | anomaly_detector as {{ re_data.quote_column('anomaly_detector') }}, 10 | owners as {{ re_data.quote_column('owners') }} 11 | from {{ ref('re_data_selected') }} 12 | {% endset %} 13 | {% set query_result = run_query(monitored_query) %} 14 | {% set monitored_file_path = monitored_path or 'target/re_data/monitored.json' %} 15 | {% do query_result.to_json(monitored_file_path) %} 16 | 17 | {% endmacro %} -------------------------------------------------------------------------------- /macros/metrics/base/internal_model_template.sql: -------------------------------------------------------------------------------- 1 | {% macro re_data_last_base_metrics_part() %} 2 | 3 | -- depends_on: {{ ref('re_data_columns') }} 4 | 5 | {{ 6 | config( 7 | materialized='table', 8 | ) 9 | }} 10 | 11 | {{ re_data.empty_last_base_metrics() }} 12 | 13 | {% endmacro %} 14 | 15 | {% macro re_data_last_base_metrics_thread(num) %} 16 | {% set part_name = 're_data_last_base_metrics_part' ~ num %} 17 | {{ re_data.generate_depends(['re_data_selected', 're_data_monitored', 're_data_columns', 're_data_run_started_at', part_name]) }} 18 | 19 | {{ 20 | config( 21 | materialized='table', 22 | ) 23 | }} 24 | 25 | {% if not re_data.in_compile() %} 26 | {{ re_data.metrics_base_compute_for_thread(num, part_name) }} 27 | {% endif %} 28 | 29 | {{ re_data.empty_last_base_metrics() }} 30 | 31 | {% endmacro %} -------------------------------------------------------------------------------- /macros/utils/agate/row_value.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro row_value(agate_row, column) %} 3 | {{ return (agate_row[re_data.name_in_db(column)]) }} 4 | {% endmacro %} 5 | 6 | {% macro agate_to_list(table) %} 7 | {% set col_names = table.column_names %} 8 | {% set query_result = [] %} 9 | {% for row in table.rows %} 10 | {% set pairs = [] %} 11 | {% for col_name in col_names %} 12 | {% set value = row.get(col_name) | string %} 13 | {% do pairs.append('"' ~ (col_name | lower) ~ '":' ~ '"' ~ (value | replace('"', '\\\"') | replace('\n', '\\n') ) ~ '"') %} 14 | {% endfor %} 15 | {% set joined_pairs = '{' ~ (pairs | join(',')) ~ '}' %} 16 | {% do query_result.append(joined_pairs) %} 17 | {% endfor %} 18 | {% set query_result = '[' ~ (query_result | join(',')) ~ ']' %} 19 | {{ return (query_result) }} 20 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/models/public_macros/validating/validated_date_and_time.sql: -------------------------------------------------------------------------------- 1 | with 2 | all_rows as ( 3 | select * from {{ ref('validate_date_and_time') }} 4 | ) 5 | 6 | select *, 7 | case when {{ re_data.valid_date_eu('date_time') }} then 1 else 0 end as valid_date_eu, 8 | case when {{ re_data.valid_date_us('date_time') }} then 1 else 0 end as valid_date_us, 9 | case when {{ 
re_data.valid_date_inverse('date_time') }} then 1 else 0 end as valid_date_inverse, 10 | case when {{ re_data.valid_date_iso_8601('date_time') }} then 1 else 0 end as valid_date_iso_8601, 11 | case when {{ re_data.valid_time_24h('date_time') }} then 1 else 0 end as valid_time_24h, 12 | case when {{ re_data.valid_time_12h('date_time') }} then 1 else 0 end as valid_time_12h, 13 | case when {{ re_data.valid_time('date_time') }} then 1 else 0 end as valid_time 14 | from all_rows 15 | -------------------------------------------------------------------------------- /integration_tests/python_tests/test_filters.py: -------------------------------------------------------------------------------- 1 | from .utils.run import dbt_seed, dbt_run, dbt_test 2 | 3 | def test_deduplication(db, source_schema, debug=True): 4 | dbt_vars = { 5 | 'source_schema': source_schema 6 | } 7 | 8 | print(f"Running setup and tests for {db}") 9 | 10 | dbt_seed('--select public_macros.filtering', db, dbt_vars) 11 | dbt_run('--select deduplicated', db, dbt_vars) 12 | dbt_test('--select deduplicated', db, dbt_vars) 13 | 14 | def test_get_duplicates(db, source_schema, debug=True): 15 | dbt_vars = { 16 | 'source_schema': source_schema 17 | } 18 | 19 | print(f"Running setup and tests for {db}") 20 | 21 | dbt_seed('--select public_macros.filtering', db, dbt_vars) 22 | dbt_run('--select duplicates', db, dbt_vars) 23 | dbt_test('--select duplicates', db, dbt_vars) 24 | 25 | print(f"Tests completed for {db}") 26 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | 2 | # Maybe change this if you are not running on a Mac 3 | CONTAINER_ARCH = linux/amd64 4 | 5 | .PHONY: help run-all-ci test-postgres generate-docs 6 | 7 | help: 8 | $(info ${HELP_MESSAGE}) 9 | @exit 0 10 | 11 | 12 | # Run GitHub Actions CI jobs locally 13 | run-all-ci: test-postgres generate-docs 14 | @echo "All CI steps completed." 15 | 16 | test-postgres: 17 | @echo "Running test-postgres job..." 18 | act -j test-postgres --container-architecture $(CONTAINER_ARCH) 19 | 20 | generate-docs: 21 | @echo "Running generate-docs job..." 22 | act -j generate-docs --container-architecture $(CONTAINER_ARCH) 23 | 24 | 25 | define HELP_MESSAGE 26 | Usage: $ make [TARGETS] 27 | 28 | TARGETS 29 | help Shows this help message 30 | run-all-ci Runs all CI steps 31 | test-postgres Runs test-postgres job 32 | generate-docs Generates documentation 33 | 34 | endef 35 | -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/cleaning/expected_sample_user_data.csv: -------------------------------------------------------------------------------- 1 | "full_name","email" 2 | "Lizzie Effertz","*****@fakemail.com" 3 | "Orlando Abbott","*****@fakemail.com" 4 | "Kelley Harann","*****@fakemail.com" 5 | "Ruth Langworth","*****@fakemail.com" 6 | "Lane Swift","*****@fakemail.com" 7 | "Bertha Corwin","*****@fakemail.com" 8 | "Manuela Kling","*****@fakemail.com" 9 | "Mose Balistreri","*****@fakemail.com" 10 | "Robin Halvorson","*****@fakemail.com" 11 | "Osbaldo Parker I","*****@fakemail.com" 12 | "Javier Runolfsson","*****@fakemail.net" 13 | "Amelia Batz","*****@fakemail.com" 14 | "Abby Pouros","*****@fakemail.com" 15 | "Markus Homenick","*****@fakemail.com" 16 | "Braeden Turner","*****@fakemail.com" 17 | "Horacio Parker","*****@fakemail.info" 18 | "Ms. Stacy Padberg","*****@fakemail.com" 19 | "Dr.
Deshawn Stracke","*****@fakemail.com" 20 | "Pascale Grady","*****@fakemail.com" 21 | "Lacy Brekke","*****@fakemail.com" 22 | -------------------------------------------------------------------------------- /integration_tests/models/metrics/re_data_metrics.yml: -------------------------------------------------------------------------------- 1 | 2 | version: 2 3 | 4 | models: 5 | - name: re_data_metrics 6 | tests: 7 | - re_data.metric_expression_is_true: 8 | table: ref('buy_events') 9 | metric: max_length 10 | column_name: event_type 11 | expression: value = 3 12 | 13 | - re_data.metric_equal_to: 14 | table: ref('buy_events') 15 | metric: max_length 16 | column_name: event_type 17 | value: 3 18 | 19 | - re_data.metric_in_range: 20 | table: ref('buy_events') 21 | metric: max_length 22 | column_name: event_type 23 | min_value: 3 24 | max_value: 3 25 | 26 | - re_data.metric_expression_is_true: 27 | table: ref('buy_events') 28 | metric: row_count 29 | expression: value > 0 and value < 10 30 | condition: time_window_start >= '2021-05-02' -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/validating/expected_validated_ips.csv: -------------------------------------------------------------------------------- 1 | ip_address,valid_ip_v4,valid_ip_v6,valid_ip 2 | 1.2.3.4,1,0,1 3 | 01.102.103.104,1,0,1 4 | 124.171.228.4,1,0,1 5 | 192.168.1.35,1,0,1 6 | 192.168.1.198,1,0,1 7 | 127.248.111.240,1,0,1 8 | 01.1.1,0,0,0 9 | 12325412,0,0,0 10 | notvalidatall,0,0,0 11 | 232.232.33,0,0,0 12 | 232.3232.232.232+2312,0,0,0 13 | ::::erwerwe,0,0,0 14 | ::3343:4343434343:34343:343434343:443,0,0,0 15 | ,0,0,0 16 | 2001:db8:3333:4444:5555:6666:7777:8888,0,1,1 17 | 2001:db8:3333:4444:CCCC:DDDD:EEEE:FFFF,0,1,1 18 | 2001:db8::,0,1,1 19 | ::1234:5678,0,1,1 20 | 2001:db8::1234:5678,0,1,1 21 | ::11.22.33.44,0,1,1 22 | 2001:db8::123.123.123.123,0,1,1 23 | 2001:db8::1234:5678:5.6.7.8,0,1,1 24 | 2001:db8:3333:4444:5555:6666:1.2.3.4,0,1,1 25 | ::11.22.33.44,0,1,1 26 | 2001:db8::123.123.123.123,0,1,1 27 | ::1234:5678:91.123.4.56,0,1,1 28 | ::1234:5678:1.2.3.4,0,1,1 29 | 2001:db8::1234:5678:5.6.7.8,0,1,1 30 | -------------------------------------------------------------------------------- /macros/public/store/export_tests_history.sql: -------------------------------------------------------------------------------- 1 | {% macro export_tests_history(start_date, end_date, tests_history_path=None) %} 2 | {% set tests_history_query %} 3 | select 4 | table_name, 5 | column_name, 6 | test_name, 7 | run_at, 8 | status, 9 | execution_time, 10 | message, 11 | failures_count, 12 | failures_json, 13 | failures_table, 14 | severity, 15 | compiled_sql 16 | from 17 | {{ ref('re_data_test_history') }} 18 | where {{ in_date_window('run_at', start_date, end_date) }} 19 | {% endset %} 20 | 21 | {% set query_result = run_query(tests_history_query) %} 22 | {% set tests_history_file_path = tests_history_path or 'target/re_data/tests_history.json' %} 23 | {% do query_result.to_json(tests_history_file_path) %} 24 | 25 | {% endmacro %} 26 | -------------------------------------------------------------------------------- /integration_tests/macros/my_metrics.sql: -------------------------------------------------------------------------------- 1 | {% macro re_data_metric_diff(context) %} 2 | max({{context.column_name}}) - min({{context.column_name}}) 3 | {% endmacro %} 4 | 5 | {% macro re_data_metric_my_custom_table_metric(context) %} 6 | 1000 7 | {% endmacro %} 8 | 9 | {% macro 
re_data_metric_regex_test(context) %} 10 | {{ regex_test(context.column_name, context.config.regex) }} 11 | {% endmacro %} 12 | 13 | {% macro regex_test(column_name, pattern) %} 14 | coalesce( 15 | sum( 16 | case when {{ re_data.regex_match_expression(column_name, pattern) }} 17 | then 1 18 | else 0 19 | end 20 | ), 0 21 | ) 22 | {% endmacro %} 23 | 24 | {% macro re_data_metric_my_distinct_table_rows(context) %} 25 | with temp_table AS ( 26 | select distinct * from {{ context.table_name }} 27 | ) 28 | select coalesce(count(*), 0) FROM temp_table 29 | {% endmacro %} -------------------------------------------------------------------------------- /macros/public/filtering/get_duplicates.sql: -------------------------------------------------------------------------------- 1 | {# https://github.com/re-data/re-data/issues/143 #} 2 | 3 | {# 4 | macro returns rows with the same key set (unique_cols) 5 | 6 | along with the fields of the base model, the following duplicates information is added: 7 | re_data_duplicate_group_row_count - total number of rows sharing the same current key set 8 | re_data_duplicate_group_row_number - position of a row inside the group of duplicates with the same current key set 9 | #} 10 | 11 | {% macro filter_get_duplicates(relation, unique_cols, sort_columns) %} 12 | ( 13 | with duplication_context as ( 14 | {{re_data.add_duplication_context(relation, unique_cols, sort_columns)}} 15 | ), 16 | duplicate_rows as ( 17 | select * from duplication_context where re_data_duplicate_group_row_count > 1 18 | ) 19 | {# return surrogate key as well? #} 20 | select * 21 | from duplicate_rows 22 | ) 23 | {% endmacro %} -------------------------------------------------------------------------------- /macros/public/cleaning/clean_additional_whitespace.sql: -------------------------------------------------------------------------------- 1 | {% macro clean_additional_whitespaces(column_name) %} 2 | {{ adapter.dispatch('clean_additional_whitespaces', 're_data')(column_name) }} 3 | {% endmacro %} 4 | 5 | {% macro default__clean_additional_whitespaces(column_name) %} 6 | trim(regexp_replace( {{ column_name }}, '\s\s+', ' ')) 7 | {% endmacro %} 8 | 9 | {% macro postgres__clean_additional_whitespaces(column_name) %} 10 | trim(regexp_replace( {{ column_name }}, '\s\s+', ' ', 'g')) 11 | {% endmacro %} 12 | 13 | {% macro redshift__clean_additional_whitespaces(column_name) %} 14 | trim(regexp_replace( {{ column_name }}, '\\s\\s+', ' ')) 15 | {% endmacro %} 16 | 17 | {% macro bigquery__clean_additional_whitespaces(column_name) %} 18 | trim(regexp_replace( {{ column_name }}, r'\s\s+', ' ')) 19 | {% endmacro %} 20 | 21 | {% macro snowflake__clean_additional_whitespaces(column_name) %} 22 | trim(regexp_replace( {{ column_name }}, '\\s\\s+', ' ')) 23 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/filtering/expected_deduplicated.csv: -------------------------------------------------------------------------------- 1 | transaction_id,creation_time,status,value,use_case 2 | 1,2021-05-01 12:31:32,pending,100,take_first 3 | 2,2021-05-01 12:35:35,pending,200,take_first 4 | 3,2021-05-01 12:40:35,pending,300,take_first 5 | 4,2021-05-02 12:35:35,completed,10,take_first 6 | 5,2021-05-02 12:40:35,pending,100,take_first 7 | 1,2021-05-01 12:40:35,completed,100,take_last 8 | 2,2021-05-01 12:35:35,pending,200,take_last 9 | 3,2021-05-02 12:31:32,completed,300,take_last 10 | 4,2021-05-02 12:40:35,completed,40,take_last 11 | 5,2021-05-02
12:40:35,pending,100,take_last 12 | 1,2021-05-01 12:40:35,completed,100,take_all_statuses 13 | 1,2021-05-01 12:31:32,pending,100,take_all_statuses 14 | 2,2021-05-01 12:35:35,pending,200,take_all_statuses 15 | 3,2021-05-02 12:31:32,completed,300,take_all_statuses 16 | 3,2021-05-01 12:40:35,pending,300,take_all_statuses 17 | 4,2021-05-02 12:40:35,completed,40,take_all_statuses 18 | 5,2021-05-02 12:40:35,pending,100,take_all_statuses 19 | -------------------------------------------------------------------------------- /integration_tests/python_tests/utils/run.py: -------------------------------------------------------------------------------- 1 | import os 2 | import yaml 3 | 4 | def dbt_command(command, for_db, dbt_vars, threads=None): 5 | debug = 'DBT_MACRO_DEBUGGING=1 ' 6 | profile_part = f' --profile re_data_{for_db}' 7 | yaml_vars = yaml.dump(dbt_vars) 8 | cmd = f'{debug} {command} --vars "{yaml_vars}" {profile_part}' 9 | if threads: 10 | cmd += f' --threads {threads}' 11 | assert os.system(cmd) == 0 12 | 13 | def dbt_seed(args, for_db, dbt_vars): 14 | dbt_command(f'dbt seed --full-refresh {args}', for_db, dbt_vars, threads=4) 15 | 16 | def dbt_run(args, for_db, dbt_vars): 17 | dbt_command(f'dbt run --full-refresh --fail-fast {args}', for_db, dbt_vars, threads=4) 18 | 19 | def dbt_test(args, for_db, dbt_vars): 20 | dbt_command(f'dbt test --store-failures --fail-fast {args}', for_db, dbt_vars, threads=4) 21 | 22 | def dbt_build(args, for_db, dbt_vars): 23 | dbt_command(f'dbt build --full-refresh --store-failures --fail-fast {args}', for_db, dbt_vars, threads=4) 24 | -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/cleaning/sample_user_data.csv: -------------------------------------------------------------------------------- 1 | full_name,email 2 | lizzie effertz,torp.trisha@fakemail.com 3 | orlando abbott,dayton.hermiston@fakemail.com 4 | kelley harann,borer.blake@fakemail.com 5 | ruth langworth,garett66@fakemail.com 6 | lane swift ,nienow.coralie@fakemail.com 7 | bertha corwin ,tstroman@fakemail.com 8 | manuela kling,shawn.langworth@fakemail.com 9 | mose balistreri,dorris70@fakemail.com 10 | robin halvorson,murazik.americo@fakemail.com 11 | osbaldo parker i ,friesen.angeline@fakemail.com 12 | javier runolfsson ,benjamin.bailey@fakemail.net 13 | amelia batz,garrison60@fakemail.com 14 | abby pouros,dominique.leannon@fakemail.com 15 | markus homenick,piper73@fakemail.com 16 | braeden turner,kozey.jace@fakemail.com 17 | horacio parker,vtillman@fakemail.info 18 | ms. stacy padberg,erdman.elaina@fakemail.com 19 | dr. 
deshawn stracke,rosendo.beer@fakemail.com 20 | pascale grady,princess60@fakemail.com 21 | lacy brekke,romaguera.darrell@fakemail.com 22 | -------------------------------------------------------------------------------- /macros/meta/information_schema.sql: -------------------------------------------------------------------------------- 1 | {% macro get_monitored_columns(schema, database) %} 2 | {{ adapter.dispatch('get_monitored_columns', 're_data')(schema, database) }} 3 | {% endmacro %} 4 | 5 | {% macro default__get_monitored_columns(table_schema, db_name) %} 6 | {% set relation = api.Relation.create(database=db_name, schema=table_schema) %} 7 | select 8 | table_name, 9 | table_schema, 10 | table_catalog, 11 | column_name, 12 | data_type, 13 | is_nullable 14 | from 15 | {{ relation.information_schema('COLUMNS') }} 16 | where 17 | table_schema = '{{ table_schema }}' 18 | {% endmacro %} 19 | 20 | {% macro redshift__get_monitored_columns(table_schema, db_name) %} 21 | select 22 | table_name, 23 | table_schema, 24 | table_catalog, 25 | column_name, 26 | data_type, 27 | is_nullable 28 | from 29 | svv_columns 30 | where 31 | table_schema = '{{ table_schema }}' 32 | {% endmacro %} 33 | -------------------------------------------------------------------------------- /models/metrics/types/schema/re_data_columns_over_time.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key = 'id', 5 | on_schema_change='sync_all_columns', 6 | ) 7 | }} 8 | 9 | 10 | with columns as ( 11 | 12 | select 13 | {{ full_table_name('cols.name', 'cols.schema', 'cols.database') }} as table_name, 14 | cols.column_name, 15 | cols.data_type, 16 | cols.is_nullable, 17 | cast ({{dbt.current_timestamp_backcompat()}} as {{ timestamp_type() }} ) as detected_time 18 | from 19 | {{ ref('re_data_columns')}} cols, {{ ref('re_data_selected')}} tables 20 | where 21 | cols.name = tables.name and cols.schema = tables.schema and cols.database = tables.database 22 | ) 23 | 24 | select 25 | cast ({{ dbt_utils.generate_surrogate_key([ 26 | 'table_name', 27 | 'column_name', 28 | 'detected_time' 29 | ]) }} as {{ string_type() }} ) as id, 30 | table_name, 31 | column_name, 32 | data_type, 33 | is_nullable, 34 | detected_time 35 | from columns -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/normalizing/us_states_normalization.csv: -------------------------------------------------------------------------------- 1 | source,target 2 | Ala.,Alabama 3 | Alaska,Alaska 4 | Ariz.,Arizona 5 | Ark.,Arkansas 6 | Calif.,California 7 | Colo.,Colorado 8 | Conn.,Connecticut 9 | Del.,Delaware 10 | D.C.,District of Columbia 11 | Fla.,Florida 12 | Ga.,Georgia 13 | Hawaii,Hawaii 14 | Idaho,Idaho 15 | Ill.,Illinois 16 | Ind.,Indiana 17 | Iowa,Iowa 18 | Kans.,Kansas 19 | Ky.,Kentucky 20 | La.,Louisiana 21 | Maine,Maine 22 | Md.,Maryland 23 | Mass.,Massachusetts 24 | Mich.,Michigan 25 | Minn.,Minnesota 26 | Miss.,Mississippi 27 | Mo.,Missouri 28 | Mont.,Montana 29 | Nebr.,Nebraska 30 | Nev.,Nevada 31 | N.H.,New Hampshire 32 | N.J.,New Jersey 33 | N.M.,New Mexico 34 | N.Y.,New York 35 | N.C.,North Carolina 36 | N.D.,North Dakota 37 | Ohio,Ohio 38 | Okla.,Oklahoma 39 | Ore.,Oregon 40 | Pa.,Pennsylvania 41 | R.I.,Rhode Island 42 | S.C.,South Carolina 43 | S.D.,South Dakota 44 | Tenn.,Tennessee 45 | Tex.,Texas 46 | Utah,Utah 47 | Vt.,Vermont 48 | Va.,Virginia 49 | Wash.,Washington 50 | W.Va.,West Virginia 51 
| Wis.,Wisconsin 52 | Wyo.,Wyoming -------------------------------------------------------------------------------- /models/alerts/re_data_alerts.sql: -------------------------------------------------------------------------------- 1 | select 2 | 'anomaly' as type, 3 | {{ re_data.clean_blacklist('table_name', ['"', '`'], '') }} as model, 4 | message, 5 | last_value_text as value, 6 | time_window_end 7 | from 8 | {{ ref(var('re_data:re_data_anomalies_filtered')) }} 9 | union all 10 | 11 | select 12 | 'schema_change' as type, 13 | {{ re_data.clean_blacklist('table_name', ['"', '`'], '') }} as model, 14 | {{ generate_schema_change_message('operation', 'column_name', 'prev_column_name', 'prev_data_type', 'data_type', 'detected_time') }} as message, 15 | '' as value, 16 | detected_time as time_window_end 17 | from {{ ref('re_data_schema_changes') }} 18 | 19 | union all 20 | 21 | select 22 | 'test' as type, 23 | table_name as model, 24 | {{ generate_failed_test_message('test_name', 'column_name') }} as message, 25 | status as value, 26 | run_at as time_window_end 27 | 28 | from {{ ref('re_data_test_history') }} 29 | where 30 | status = 'Fail' 31 | or status = 'Error' 32 | {% if var('re_data:show_warns_as_alerts') %} 33 | or status = 'Warn' 34 | {% endif %} 35 | -------------------------------------------------------------------------------- /macros/public/store/export_alerts.sql: -------------------------------------------------------------------------------- 1 | {% macro export_alerts(start_date, end_date, alerts_path=None, monitored_path=None) %} 2 | {% set alerts_query %} 3 | select 4 | type as {{ re_data.quote_column('type') }}, 5 | model as {{ re_data.quote_column('model') }}, 6 | message as {{ re_data.quote_column('message') }}, 7 | value as {{ re_data.quote_column('value') }}, 8 | {{ format_timestamp('time_window_end')}} as {{ re_data.quote_column('time_window_end') }} 9 | from {{ ref('re_data_alerts') }} 10 | where 11 | case 12 | when type = 'anomaly' then {{ in_date_window('time_window_end', start_date, end_date) }} 13 | else {{ in_date_window('time_window_end', start_date, none) }} 14 | end 15 | order by time_window_end desc 16 | {% endset %} 17 | 18 | {% set query_result = run_query(alerts_query) %} 19 | {% set alerts_file_path = alerts_path or 'target/re_data/alerts.json' %} 20 | {% do query_result.to_json(alerts_file_path) %} 21 | {{ save_monitored(monitored_path) }} 22 | {% endmacro %} 23 | -------------------------------------------------------------------------------- /integration_tests/macros/create_test_source_tables.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% macro create_test_source_tables() %} 4 | 5 | {% set create_table %} 6 | CREATE SCHEMA IF NOT EXISTS {{target.schema}}; 7 | DROP TABLE IF EXISTS {{target.schema}}.re_data_source_test_table; 8 | CREATE TABLE IF NOT EXISTS {{target.schema}}.re_data_source_test_table ( 9 | number {{ re_data.integer_type() }}, 10 | description {{ re_data.string_type() }}, 11 | created_at {{ re_data.timestamp_type() }} 12 | ); 13 | INSERT INTO {{target.schema}}.re_data_source_test_table (number, description, created_at) VALUES 14 | (1, 'one', current_timestamp), 15 | (2, 'two', current_timestamp), 16 | (3, 'three', current_timestamp), 17 | (4, 'four', current_timestamp), 18 | (5, 'five', current_timestamp), 19 | (6, 'six', current_timestamp), 20 | (7, 'seven', current_timestamp), 21 | (8, 'eight', current_timestamp), 22 | (9, 'nine', current_timestamp), 23 | (10, 'ten', current_timestamp 24 | ); 25
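-- Added note (not part of the original script): the re_data.integer_type() / string_type() / timestamp_type()
-- helpers above resolve to adapter-specific column types, which is what lets this single DDL script
-- create the same source test table on every warehouse the integration tests run against.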
| {% endset %} 26 | {% do run_query(create_table) %} 27 | 28 | {% endmacro %} -------------------------------------------------------------------------------- /macros/utils/fivetran_utils/json_extract.sql: -------------------------------------------------------------------------------- 1 | {# 2 | # This file contains significant part of code derived from 3 | # https://github.com/fivetran/dbt_fivetran_utils/tree/v0.4.0 which is licensed under Apache License 2.0. 4 | #} 5 | 6 | {% macro json_extract(string, string_path) -%} 7 | 8 | {{ adapter.dispatch('json_extract','re_data') (string, string_path) }} 9 | 10 | {%- endmacro %} 11 | 12 | {% macro default__json_extract(string, string_path) %} 13 | 14 | json_extract_path_text({{string}}, {{ "'" ~ string_path ~ "'" }} ) 15 | 16 | {% endmacro %} 17 | 18 | {% macro snowflake__json_extract(string, string_path) %} 19 | 20 | json_extract_path_text(try_parse_json( {{string}} ), {{ "'" ~ string_path ~ "'" }} ) 21 | 22 | {% endmacro %} 23 | 24 | {% macro redshift__json_extract(string, string_path) %} 25 | 26 | case when is_valid_json( {{string}} ) then json_extract_path_text({{string}}, {{ "'" ~ string_path ~ "'" }} ) else null end 27 | 28 | {% endmacro %} 29 | 30 | {% macro bigquery__json_extract(string, string_path) %} 31 | 32 | json_extract_scalar({{string}}, {{ "'$." ~ string_path ~ "'" }} ) 33 | 34 | {% endmacro %} 35 | 36 | {% macro postgres__json_extract(string, string_path) %} 37 | 38 | {{string}}::json->>{{"'" ~ string_path ~ "'" }} 39 | 40 | {% endmacro %} 41 | -------------------------------------------------------------------------------- /macros/public/normalizing/normalize_values.sql: -------------------------------------------------------------------------------- 1 | {% macro is_dbt_relation(obj) %} 2 | {{ return (obj is mapping and obj.get('metadata', {}).get('type', '').endswith('Relation') )}} 3 | {% endmacro %} 4 | 5 | {% macro normalize_expression_cte(reference_table) %} 6 | with target_table as ( 7 | {% if re_data.is_dbt_relation(reference_table) or reference_table is string %} 8 | select * from {{ reference_table }} 9 | {% elif reference_table is mapping %} 10 | {% for key, value in reference_table.items() %} 11 | select '{{key}}' as source, '{{value}}' as target 12 | {% if not loop.last %}union all{% endif %} 13 | {% endfor %} 14 | {% endif %} 15 | ) 16 | {% endmacro %} 17 | 18 | {%- macro normalize_values(source_relation, column_name, reference_table) -%} 19 | ( 20 | {{ re_data.normalize_expression_cte(reference_table) }} 21 | 22 | select s.*, 23 | case when t.source is null 24 | then s.{{column_name}} 25 | else t.target 26 | end as {{ column_name + '__normalized'}} 27 | from {{ source_relation }} s 28 | left join target_table t 29 | on t.source = s.{{column_name}} 30 | ) 31 | {%- endmacro -%} 32 | 33 | -------------------------------------------------------------------------------- /macros/utils/monitored_config.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% macro columns_in_db(columns) %} 4 | {% set translated = [] %} 5 | {% if columns is none %} 6 | {{ return (none) }} 7 | {% endif %} 8 | {% for col in columns %} 9 | {% do translated.append(re_data.name_in_db(col))%} 10 | {% endfor %} 11 | {{ return (translated) }} 12 | {% endmacro %} 13 | 14 | {% macro metrics_in_db(metrics) %} 15 | {% set translated = metrics %} 16 | {% set column_metrics = {} %} 17 | {% for col in metrics.column %} 18 | {% do column_metrics.update({re_data.name_in_db(col): metrics.column[col]}) %} 19 | {% 
endfor %} 20 | {% if column_metrics %} 21 | {% do metrics.update({'column': column_metrics}) %} 22 | {% endif %} 23 | {{ return (metrics) }} 24 | {% endmacro %} 25 | 26 | {% macro final_metrics(metrics_groups, additional_metrics) %} 27 | {% set final_metrics_dict = dict([('group', {}), ('additional', {})]) %} 28 | {% set all_metrics_groups = var('re_data:metrics_groups') %} 29 | 30 | {% for group in metrics_groups %} 31 | {% set value = all_metrics_groups.get(group) %} 32 | {% do final_metrics_dict['group'].update(value) %} 33 | {% endfor %} 34 | 35 | {% do final_metrics_dict['additional'].update(additional_metrics) %} 36 | {{ return (final_metrics_dict) }} 37 | 38 | {% endmacro %} -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # What is re_data? 2 | 3 | re_data is an open-source data reliability framework for the modern data stack. 😊 4 | 5 | Currently, re_data focuses on observing dbt projects, together with the underlying data warehouse (Postgres, BigQuery, Snowflake, or Redshift). 6 | 7 | Data transformations in re_data are implemented and exposed as models & macros in this dbt package. 8 | 9 | # Live demo 10 | 11 | Check out our **[live demo](https://docs.getre.io/ui-latest)** of what re_data can do for you 😊 12 | 13 | # Getting started 14 | 15 | [Check our docs!](https://docs.getre.io/) 🙂 16 | 17 | [Join the re_data community on Slack](https://join.slack.com/t/re-data/shared_invite/zt-vkauq1y8-tL4R4_H5nZoVvyXyy0hdug) (we are very responsive there) 18 | 19 | [Check out more info, issues, etc. in the master repo](https://github.com/re-data/re-data) 20 | 21 | # Community 22 | 23 | Say hi to us on: 🙂 24 | 25 | - [Slack](https://join.slack.com/t/re-data/shared_invite/zt-vkauq1y8-tL4R4_H5nZoVvyXyy0hdug) 26 | - [Twitter](https://twitter.com/re_data_labs) 27 | - [LinkedIn](https://www.linkedin.com/company/74608627/) 28 | 29 | # Contributing 30 | 31 | Any contributions are greatly appreciated! Most of our documentation and GitHub issues are managed in the primary [re-data](https://github.com/re-data/re-data) repo. See the Contributing section in [re-data](https://github.com/re-data/re-data) for details. 32 | -------------------------------------------------------------------------------- /macros/metrics/base/build_in/table_default.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro re_data_metric_row_count(context) %} 3 | count(1) 4 | {% endmacro %} 5 | 6 | {% macro re_data_metric_freshness(context) %} 7 | {{ freshness_expression(context.time_filter) }} 8 | {% endmacro %} 9 | 10 | {% macro freshness_expression(time_filter) %} 11 | {# /* If time_filter is none, we are computing the metric globally.
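(Added clarification: when a time filter is present, the adapter-specific expressions dispatched below compute freshness as the number of seconds between the end of the monitored time window, time_window_end(), and the newest value of the filter column, max(time_filter).)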
We set the value to null, since a table monitored without a time filter doesn't really have a freshness metric. */ #} 12 | {% if time_filter is none %} 13 | cast(null as {{ numeric_type() }}) 14 | {% else %} 15 | {{ adapter.dispatch('freshness_expression', 're_data')(time_filter) }} 16 | {% endif %} 17 | {% endmacro %} 18 | 19 | {% macro default__freshness_expression(time_filter) %} 20 | EXTRACT(EPOCH FROM ({{time_window_end()}} - max({{time_filter}}))) 21 | {% endmacro %} 22 | 23 | {% macro bigquery__freshness_expression(time_filter) %} 24 | TIMESTAMP_DIFF(timestamp({{ time_window_end() }}), timestamp(max({{time_filter}})), SECOND) 25 | {% endmacro %} 26 | 27 | {% macro snowflake__freshness_expression(time_filter) %} 28 | timediff(second, max({{time_filter}}), {{- time_window_end() -}}) 29 | {% endmacro %} 30 | 31 | {% macro redshift__freshness_expression(time_filter) %} 32 | DATEDIFF(second, max({{time_filter}}), {{- time_window_end() -}}) 33 | {% endmacro %} 34 | -------------------------------------------------------------------------------- /integration_tests/models/transformed/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: buy_events 5 | tests: 6 | - re_data.assert_in_range: 7 | metric: row_count 8 | min_value: 0 9 | max_value: 10 10 | 11 | - re_data.assert_equal: 12 | metric: row_count 13 | value: 1 14 | 15 | - re_data.assert_false: 16 | metric: freshness 17 | expression: value is null 18 | 19 | - re_data.assert_greater_equal: 20 | metric: my_distinct_table_rows 21 | value: 10 22 | 23 | columns: 24 | - name: value1 25 | tests: 26 | - re_data.assert_in_range: 27 | metric: nulls_percent 28 | min_value: 0 29 | max_value: 10 30 | 31 | - re_data.assert_true: 32 | metric: nulls_percent 33 | expression: value = 0 34 | 35 | - re_data.assert_less: 36 | metric: min 37 | value: 100 38 | condition: time_window_start = '2021-05-02' 39 | 40 | - re_data.assert_less_equal: 41 | metric: min 42 | value: 107 43 | 44 | - name: value2 45 | tests: 46 | - re_data.assert_greater_equal: 47 | metric: min 48 | value: 200 49 | condition: time_window_start = '2021-05-02' 50 | -------------------------------------------------------------------------------- /macros/public/cleaning/clean_blacklist.sql: -------------------------------------------------------------------------------- 1 | {% macro generate_blacklist_pattern(chars_to_blacklist) %} 2 | {% set pattern = [] %} 3 | {% for char in chars_to_blacklist %} 4 | {% set expr = '(' + char + ')' %} 5 | {% do pattern.append(expr) %} 6 | {% endfor %} 7 | 8 | {{ return(pattern | join('|')) }} 9 | {% endmacro %} 10 | 11 | {%- macro clean_blacklist(column_name, chars_to_blacklist, replacement) -%} 12 | {% set pattern_string = re_data.generate_blacklist_pattern(chars_to_blacklist) %} 13 | 14 | {{ adapter.dispatch('clean_blacklist', 're_data')(column_name, pattern_string, replacement) }} 15 | {%- endmacro -%} 16 | 17 | {%- macro default__clean_blacklist(column_name, pattern_string, replacement) -%} 18 | regexp_replace( {{ column_name }}, '{{ pattern_string }}', '{{ replacement }}') 19 | {%- endmacro -%} 20 | 21 | {%- macro postgres__clean_blacklist(column_name, pattern_string, replacement) -%} 22 | regexp_replace( {{ column_name }}, '{{ pattern_string }}', '{{ replacement }}', 'g') 23 | {%- endmacro -%} 24 | 25 | {%- macro redshift__clean_blacklist(column_name, pattern_string, replacement) -%} 26 | regexp_replace( {{ column_name }}, '{{ pattern_string }}', '{{ replacement }}') 27 | {%-
endmacro -%} 28 | 29 | {%- macro bigquery__clean_blacklist(column_name, pattern_string, replacement) -%} 30 | regexp_replace( {{ column_name }}, """{{ pattern_string }}""", '{{ replacement }}') 31 | {%- endmacro -%} -------------------------------------------------------------------------------- /models/meta/re_data_columns.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='table', 4 | ) 5 | }} 6 | 7 | -- depends_on: {{ ref('re_data_run_started_at') }} 8 | -- depends_on: {{ ref('re_data_monitored') }} 9 | -- depends_on: {{ ref('re_data_selected') }} 10 | 11 | {% if execute %} 12 | {% set schemas = run_query(re_data.get_schemas()) %} 13 | {% if schemas %} 14 | 15 | with columns_from_select as ( 16 | {% for row in schemas %} 17 | {% set schema_name = re_data.name_in_db(re_data.row_value(row, 'schema')) %} 18 | {{ get_monitored_columns(schema_name, re_data.row_value(row, 'database')) }} 19 | {%- if not loop.last %} union all {%- endif %} 20 | {% endfor %} 21 | ) 22 | 23 | select 24 | cast (table_name as {{ string_type() }} ) as name, 25 | cast (table_schema as {{ string_type() }} ) as schema, 26 | cast (table_catalog as {{ string_type() }} ) as database, 27 | cast (column_name as {{ string_type() }} ) as column_name, 28 | cast (data_type as {{ string_type() }} ) as data_type, 29 | cast (case is_nullable when 'YES' then 1 else 0 end as {{ boolean_type() }} ) as is_nullable, 30 | cast ({{dbt.current_timestamp_backcompat()}} as {{ timestamp_type() }}) as computed_on 31 | from columns_from_select 32 | 33 | {% else %} 34 | {{ empty_columns_table() }} 35 | {% endif %} 36 | 37 | {% else %} 38 | {{ empty_columns_table() }} 39 | {% endif %} 40 | -------------------------------------------------------------------------------- /macros/meta/table_name.sql: -------------------------------------------------------------------------------- 1 | {% macro full_table_name(table_name, table_schema, table_catalog) %} 2 | {{ adapter.dispatch('full_table_name', 're_data')(table_name, table_schema, table_catalog) }} 3 | {% endmacro %} 4 | 5 | 6 | {% macro default__full_table_name(table_name, table_schema, table_catalog) %} 7 | '"' || {{table_catalog}} || '"' || '.' || '"' || {{table_schema}} || '"' || '.' || '"' || {{table_name}} || '"' 8 | {% endmacro %} 9 | 10 | 11 | {% macro bigquery__full_table_name(table_name, table_schema, table_catalog) %} 12 | '`' || {{table_catalog}} || '`' || '.' || '`' || {{table_schema}} || '`' || '.' 
|| '`' || {{table_name}} || '`' 13 | {% endmacro %} 14 | 15 | 16 | {% macro full_table_name_values(table_name, table_schema, table_catalog) %} 17 | {% set result = adapter.dispatch('full_table_name_values', 're_data')(table_name, table_schema, table_catalog) %} 18 | {{ return (result.strip()) }} 19 | {% endmacro %} 20 | 21 | {% macro default__full_table_name_values(table_name, table_schema, table_catalog) %} 22 | "{{table_catalog}}"."{{table_schema}}"."{{table_name}}" 23 | {% endmacro %} 24 | 25 | 26 | {% macro bigquery__full_table_name_values(table_name, table_schema, table_catalog) %} 27 | `{{table_catalog}}`.`{{table_schema}}`.`{{table_name}}` 28 | {% endmacro %} 29 | 30 | 31 | {% macro snowflake__full_table_name_values(table_name, table_schema, table_catalog) %} 32 | "{{table_catalog|upper}}"."{{table_schema|upper}}"."{{table_name|upper}}" 33 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/seeds/public_macros/normalizing/expected_us_states_normalized.csv: -------------------------------------------------------------------------------- 1 | "state","code","state__normalized" 2 | "Ariz.","AZ","Arizona" 3 | "Ind.","IN","Indiana" 4 | "La.","LA","Louisiana" 5 | "W.Va.","WV","West Virginia" 6 | "Nebr.","NE","Nebraska" 7 | "Pa.","PA","Pennsylvania" 8 | "Iowa","IA","Iowa" 9 | "N.H.","NH","New Hampshire" 10 | "S.C.","SC","South Carolina" 11 | "Ore.","OR","Oregon" 12 | "Conn.","CT","Connecticut" 13 | "R.I.","RI","Rhode Island" 14 | "Minn.","MN","Minnesota" 15 | "D.C.","DC","District of Columbia" 16 | "Wyo.","WY","Wyoming" 17 | "Hawaii","HI","Hawaii" 18 | "Wash.","WA","Washington" 19 | "N.D.","ND","North Dakota" 20 | "Mass.","MA","Massachusetts" 21 | "N.Y.","NY","New York" 22 | "N.M.","NM","New Mexico" 23 | "Colo.","CO","Colorado" 24 | "Ohio","OH","Ohio" 25 | "Idaho","ID","Idaho" 26 | "Ala.","AL","Alabama" 27 | "Ark.","AR","Arkansas" 28 | "S.D.","SD","South Dakota" 29 | "Mo.","MO","Missouri" 30 | "N.J.","NJ","New Jersey" 31 | "Miss.","MS","Mississippi" 32 | "Kans.","KS","Kansas" 33 | "Vt.","VT","Vermont" 34 | "Calif.","CA","California" 35 | "Mich.","MI","Michigan" 36 | "Alaska","AK","Alaska" 37 | "Nev.","NV","Nevada" 38 | "Okla.","OK","Oklahoma" 39 | "Tenn.","TN","Tennessee" 40 | "Ga.","GA","Georgia" 41 | "Wis.","WI","Wisconsin" 42 | "Ky.","KY","Kentucky" 43 | "N.C.","NC","North Carolina" 44 | "Mont.","MT","Montana" 45 | "Fla.","FL","Florida" 46 | "Va.","VA","Virginia" 47 | "Tex.","TX","Texas" 48 | "Md.","MD","Maryland" 49 | "Utah","UT","Utah" 50 | "Maine","ME","Maine" 51 | "Del.","DE","Delaware" 52 | "Ill.","IL","Illinois" 53 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 redata-team 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | 24 | ======================================================================== 25 | Third party Apache 2.0 licenses 26 | ======================================================================== 27 | 28 | The following files contain significant parts of code licensed under the third party Apache 2.0 License. 29 | 30 | macros/public/validating/regex_dict.sql 31 | macros/utils/fivetran_utils/percentile.sql 32 | macros/utils/fivetran_utils/json_extract.sql 33 | 34 | 35 | -------------------------------------------------------------------------------- /.github/workflows/generate-docs.yml: -------------------------------------------------------------------------------- 1 | name: Generate Documentation 2 | 3 | on: [push] 4 | 5 | env: 6 | DBT_PROFILES_DIR: ./ 7 | DBT_VERSION: 1.7 8 | PYTHON_VERSION: "3.8.x" 9 | 10 | jobs: 11 | generate-docs: 12 | runs-on: ubuntu-latest 13 | if: github.event_name == 'push' 14 | services: 15 | postgres: 16 | image: postgres 17 | env: 18 | POSTGRES_PASSWORD: postgres 19 | # Set health checks to wait until postgres has started 20 | options: >- 21 | --health-cmd pg_isready 22 | --health-interval 10s 23 | --health-timeout 5s 24 | --health-retries 5 25 | ports: 26 | # Maps tcp port 5432 on service container to the host 27 | - 5432:5432 28 | steps: 29 | - name: Check out 30 | uses: actions/checkout@v2 31 | 32 | - uses: actions/setup-python@v4 33 | with: 34 | python-version: ${{ env.PYTHON_VERSION }} 35 | 36 | - name: Install dependencies and run 37 | run: | 38 | pip install protobuf==4.25.3 dbt-postgres==$DBT_VERSION 39 | dbt deps 40 | dbt run 41 | 42 | - name: Generate Documentation 43 | run: dbt docs generate 44 | 45 | - name: Copy files 46 | if: github.ref == 'refs/heads/main' 47 | run: 'mkdir docs && cp target/{catalog.json,index.html,manifest.json,run_results.json} docs/' 48 | shell: bash 49 | 50 | - name: Deploy 51 | uses: peaceiris/actions-gh-pages@v3 52 | if: github.ref == 'refs/heads/main' && !env.ACT 53 | with: 54 | github_token: ${{ secrets.GITHUB_TOKEN }} 55 | publish_dir: ./docs 56 | -------------------------------------------------------------------------------- /integration_tests/macros/drop_all_schemas.sql: -------------------------------------------------------------------------------- 1 | {% macro get_schemas_used(schema_name) %} 2 | {% set schemas = [ 3 | schema_name, 4 | schema_name + '_re', 5 | schema_name + '_re_internal', 6 | schema_name + '_raw', 7 | schema_name + '_expected', 8 | schema_name + '_dbt_test__audit', 9 | schema_name + '_seeds' 10 | ] %} 11 | {{ return (schemas) }} 12 | {% endmacro %} 13 | 14 | {% macro drop_all_schemas(schema_name) %} 15 | {% set schemas_to_drop = get_schemas_used(schema_name) %} 16 | {{ adapter.dispatch('drop_all_schemas')(schemas_to_drop) }} 17 | {% endmacro %} 18 | 19 | {% macro default__drop_all_schemas(schemas_to_drop) %} 20 | {% for schema in schemas_to_drop %} 21 | {% set relation = api.Relation.create(database=target.database, schema=schema) %} 22 | {% do adapter.drop_schema(relation) %} 23 | {% endfor %} 24 | {%
endmacro %} 25 | 26 | {% macro redshift__drop_all_schemas(schemas_to_drop) %} 27 | {# 28 | dropping schemas with adapter.drop_schema doesn't seem to work on Redshift, 29 | so we fall back to issuing DDL commands directly 30 | #} 31 | {% set drop_query %} 32 | {% for schema in schemas_to_drop %} 33 | drop schema if exists {{schema}} cascade; 34 | {% endfor %} 35 | {% endset %} 36 | {% do run_query(drop_query) %} 37 | {% endmacro %} 38 | 39 | {% macro create_required_schemas(schema_name) %} 40 | {# required to manually create schemas used for redshift tests #} 41 | {% set schemas_to_create = get_schemas_used(schema_name) %} 42 | {% set create_query %} 43 | {% for schema in schemas_to_create %} 44 | create schema if not exists {{schema}}; 45 | {% endfor %} 46 | {% endset %} 47 | {% do run_query(create_query) %} 48 | {% endmacro %} -------------------------------------------------------------------------------- /macros/utils/fivetran_utils/percentile.sql: -------------------------------------------------------------------------------- 1 | {# 2 | # This file contains significant part of code derived from 3 | # https://github.com/fivetran/dbt_fivetran_utils/tree/v0.4.0 which is licensed under Apache License 2.0. 4 | #} 5 | 6 | {% macro percentile(percentile_field, partition_field, percent) -%} 7 | 8 | {{ adapter.dispatch('percentile','re_data') (percentile_field, partition_field, percent) }} 9 | 10 | {%- endmacro %} 11 | 12 | --default percentile calculation (same window syntax as Redshift) 13 | {% macro default__percentile(percentile_field, partition_field, percent) %} 14 | 15 | percentile_cont( 16 | {{ percent }} ) 17 | within group ( order by {{ percentile_field }} ) 18 | over ( partition by {{ partition_field }} ) 19 | 20 | {% endmacro %} 21 | 22 | --percentile calculation specific to Redshift 23 | {% macro redshift__percentile(percentile_field, partition_field, percent) %} 24 | 25 | percentile_cont( 26 | {{ percent }} ) 27 | within group ( order by {{ percentile_field }} ) 28 | over ( partition by {{ partition_field }} ) 29 | 30 | {% endmacro %} 31 | 32 | --percentile calculation specific to BigQuery 33 | {% macro bigquery__percentile(percentile_field, partition_field, percent) %} 34 | 35 | percentile_cont( 36 | {{ percentile_field }}, 37 | {{ percent }}) 38 | over (partition by {{ partition_field }} 39 | ) 40 | 41 | {% endmacro %} 42 | 43 | {% macro postgres__percentile(percentile_field, partition_field, percent) %} 44 | 45 | percentile_cont( 46 | {{ percent }} ) 47 | within group ( order by {{ percentile_field }} ) 48 | /* have to group by partition field */ 49 | 50 | {% endmacro %} 51 | 52 | {% macro spark__percentile(percentile_field, partition_field, percent) %} 53 | 54 | percentile( 55 | {{ percentile_field }}, 56 | {{ percent }}) 57 | over (partition by {{ partition_field }} 58 | ) 59 | 60 | {% endmacro %} 61 | -------------------------------------------------------------------------------- /dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: "re_data" 2 | version: "0.12.0" 3 | config-version: 2 4 | 5 | require-dbt-version: [">=1.0.0", "<2.0.0"] 6 | 7 | profile: "re_data_postgres" 8 | 9 | target-path: "target" # directory which will store compiled SQL files 10 | clean-targets: ["target", "dbt_modules", "dbt_packages"] 11 | 12 | on-run-end: 13 | - "{% if var('re_data:save_test_history') %} {{ re_data.save_test_history(results) }} {% endif %}" 14 | 15 | vars: 16 | re_data:max_columns_in_query: 10 17 | re_data:time_window_end: '{{
run_started_at.strftime("%Y-%m-%d 00:00:00") }}' 18 | re_data:time_window_start: '{{ (run_started_at - modules.datetime.timedelta(1)).strftime("%Y-%m-%d 00:00:00") }}' 19 | re_data:anomaly_detection_look_back_days: 30 20 | re_data:select: null 21 | re_data:re_data_anomalies_filtered: re_data_anomalies 22 | 23 | re_data:alerting_z_score: 3 24 | 25 | re_data:save_test_history: false 26 | re_data:show_warns_as_alerts: false 27 | 28 | re_data:anomaly_detector: 29 | name: modified_z_score 30 | threshold: 3 31 | 32 | re_data:store_table_samples: false 33 | 34 | re_data:metrics_groups: 35 | table_metrics: 36 | table: 37 | - row_count 38 | - freshness 39 | 40 | column_metrics: 41 | column: 42 | numeric: 43 | - min 44 | - max 45 | - avg 46 | - stddev 47 | - variance 48 | - nulls_count 49 | - nulls_percent 50 | text: 51 | - min_length 52 | - max_length 53 | - avg_length 54 | - nulls_count 55 | - missing_count 56 | - nulls_percent 57 | - missing_percent 58 | boolean: 59 | - count_true 60 | - count_false 61 | - nulls_count 62 | - nulls_percent 63 | 64 | re_data:default_metrics: 65 | - table_metrics 66 | - column_metrics 67 | 68 | models: 69 | re_data: 70 | +schema: re 71 | internal: 72 | +schema: re_internal 73 | -------------------------------------------------------------------------------- /macros/utils/mock/empty_tables.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro all_types_select() %} 3 | with types_table as ( 4 | select 5 | cast (null as {{ string_type() }}) as string_type, 6 | cast (null as {{ long_string_type() }}) as long_string_type, 7 | cast (1 as {{ numeric_type() }}) as numeric_type, 8 | cast ('2000-01-10' as {{ timestamp_type() }}) as timestamp_type, 9 | cast (true as {{ boolean_type() }}) as boolean_type 10 | ) 11 | {% endmacro %} 12 | 13 | {% macro empty_table_generic(list) %} 14 | {{ re_data.all_types_select() }} 15 | select 16 | {% for name, type in list %} 17 | {{ type }}_type as {{ name }} 18 | {%- if not loop.last %}, {%- endif %} 19 | {% endfor %} 20 | from types_table 21 | where string_type is not null 22 | {% endmacro %} 23 | 24 | {% macro empty_last_base_metrics() %} 25 | {{ 26 | re_data.empty_table_generic([ 27 | ('table_name', 'string'), 28 | ('column_name', 'string'), 29 | ('metric', 'string'), 30 | ('value', 'numeric') 31 | ]) 32 | }} 33 | {% endmacro %} 34 | 35 | {% macro empty_last_table_samples() %} 36 | {{ 37 | re_data.empty_table_generic([ 38 | ('table_name', 'string'), 39 | ('sample_data', 'string') 40 | ]) 41 | }} 42 | {% endmacro %} 43 | 44 | {% macro empty_columns_table() %} 45 | {{ 46 | re_data.empty_table_generic([ 47 | ('name', 'string'), 48 | ('schema', 'string'), 49 | ('database', 'string'), 50 | ('column_name', 'string'), 51 | ('data_type', 'string'), 52 | ('is_nullable', 'boolean'), 53 | ('time_filter', 'string'), 54 | ('computed_on', 'timestamp') 55 | ]) 56 | }} 57 | {% endmacro %} 58 | 59 | 60 | {% macro empty_table() %} 61 | {{ 62 | re_data.empty_table_generic([ 63 | ('name', 'string') 64 | ]) 65 | }} 66 | {% endmacro %} 67 | 68 | -------------------------------------------------------------------------------- /integration_tests/seeds/monitoring/expected_test_history.csv: -------------------------------------------------------------------------------- 1 | table_name,column_name,test_name,status,message,failures_count,severity 2 | TEST_RE_DATA_METRICS,---,pected_metrics_,Pass,---,0,ERROR 3 | TEST_RE_DATA_TABLE_SAMPLES,---,_table_samples_,Pass,---,0,ERROR 4 | 
TEST_RE_DATA_Z_SCORE,---,pected_z_score_,Pass,---,0,ERROR 5 | TEST_RE_DATA_ANOMALIES,---,cted_anomalies_,Pass,---,0,ERROR 6 | BUY_EVENTS,---,_table_rows__10,Pass,---,0,ERROR 7 | BUY_EVENTS,---,ts_row_count__1,Pass,---,0,ERROR 8 | BUY_EVENTS,---,null__freshness,Pass,---,0,ERROR 9 | BUY_EVENTS,VALUE2,5_02___min__200,Pass,---,0,ERROR 10 | BUY_EVENTS,VALUE1,alue1__min__107,Pass,---,0,ERROR 11 | BUY_EVENTS,VALUE1,ulls_percent__0,Pass,---,0,ERROR 12 | BUY_EVENTS,VALUE1,5_02___min__100,Pass,---,0,ERROR 13 | BUY_EVENTS,---,0__row_count__0,Pass,---,0,ERROR 14 | RE_DATA_METRICS,---,_buy_events___3,Pass,---,0,ERROR 15 | RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR 16 | RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR 17 | BUY_EVENTS,VALUE1,__nulls_percent,Pass,---,0,ERROR 18 | RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR 19 | TEST_RE_DATA_TABLE_SAMPLES,---,_table_samples_,Pass,---,0,ERROR 20 | TEST_RE_DATA_ANOMALIES,---,cted_anomalies_,Pass,---,0,ERROR 21 | BUY_EVENTS,---,ts_row_count__1,Pass,---,0,ERROR 22 | BUY_EVENTS,---,null__freshness,Pass,---,0,ERROR 23 | BUY_EVENTS,VALUE2,5_02___min__200,Pass,---,0,ERROR 24 | BUY_EVENTS,---,_table_rows__10,Pass,---,0,ERROR 25 | BUY_EVENTS,VALUE1,ulls_percent__0,Pass,---,0,ERROR 26 | BUY_EVENTS,---,0__row_count__0,Pass,---,0,ERROR 27 | BUY_EVENTS,VALUE1,5_02___min__100,Pass,---,0,ERROR 28 | BUY_EVENTS,VALUE1,alue1__min__107,Pass,---,0,ERROR 29 | BUY_EVENTS,VALUE1,__nulls_percent,Pass,---,0,ERROR 30 | RE_DATA_METRICS,---,_buy_events___3,Pass,---,0,ERROR 31 | RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR 32 | RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR 33 | RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR 34 | TEST_RE_DATA_METRICS,---,pected_metrics_,Pass,---,0,ERROR 35 | TEST_RE_DATA_Z_SCORE,---,pected_z_score_,Pass,---,0,ERROR 36 | -------------------------------------------------------------------------------- /macros/store/insert_list_to_table.sql: -------------------------------------------------------------------------------- 1 | {% macro insert_list_to_table(table, list, params, dtype=None,insert_size=100) %} 2 | 3 | {% set single_insert_list = [] %} 4 | {% for el in list %} 5 | {% do single_insert_list.append(el) %} 6 | {% set single_insert_list_size = single_insert_list | length %} 7 | {% if single_insert_list_size == insert_size or loop.last %} 8 | 9 | {% set insert_query %} 10 | insert into {{ table }} ({%- for p in params %}{{p}}{% if not loop.last %}, {% endif %}{% endfor %}) values 11 | {%- for row in single_insert_list -%} 12 | ( 13 | {%- for p in params -%} 14 | {%- if row[p] is none -%} 15 | NULL 16 | {%- else -%} 17 | {%- if row[p] is string -%} 18 | {%- if dtype and p in dtype -%} 19 | {% set cast_type = dtype[p] %} 20 | cast ({{ re_data.quote_string(row[p]) }} as {{ cast_type }}) 21 | {%- else %} 22 | {{- re_data.quote_string(row[p]) -}} 23 | {%- endif -%} 24 | {%- elif row[p] is number -%} 25 | {{-row[p]-}} 26 | {%- else -%} 27 | {{- re_data.quote_string(tojson(row[p])) -}} 28 | {%- endif -%} 29 | {%- endif -%} 30 | {%- if not loop.last -%},{%- endif -%} 31 | {%- endfor -%} 32 | ) 33 | {%- if not loop.last -%},{%- endif %} 34 | {% endfor -%} 35 | {% endset %} 36 | 37 | {% do run_query(insert_query) %} 38 | {% do single_insert_list.clear() %} 39 | {% endif %} 40 | {% endfor %} 41 | 42 | {% endmacro %} 43 | -------------------------------------------------------------------------------- /macros/config/get_model_config.sql: -------------------------------------------------------------------------------- 1 
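{#
    Added note (a sketch, not part of the original file): get_model_config turns one row describing a
    monitored table into the config dict consumed by the internal metric and sample templates, e.g.
    (as used in macros/samples/internal_model_template.sql, where the row variable is named sample_table):

        {% set model = get_model_config(monitored_row) %}
        select ... from {{ model.table_name }} {{ order_by_if_time_filter(model.time_filter) }}

    Keys populated below: name, schema, database, time_filter, metrics, model_name, table_name,
    columns_dict, columns_compute_all, columns_info and columns.
#}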
| 2 | {% macro get_model_config(monitored) %} 3 | {% set model = {} %} 4 | {% do model.update({'name': re_data.row_value(monitored, 'name')}) %} 5 | {% do model.update({'schema': re_data.row_value(monitored, 'schema')}) %} 6 | {% do model.update({'database': re_data.row_value(monitored, 'database')}) %} 7 | {% do model.update({'time_filter': re_data.row_value(monitored, 'time_filter')}) %} 8 | {% do model.update({'metrics': fromjson(re_data.row_value(monitored, 'metrics'))}) %} 9 | {% do model.update({'model_name': model.get('database') + '.' + model.get('schema') + '.' + model.get('name')}) %} 10 | {% do model.update({'table_name': full_table_name_values(model.get('name'), model.get('schema'), model.get('database'))}) %} 11 | 12 | {% set columns_db = re_data.row_value(monitored, 'columns') %} 13 | 14 | {% set column_list = fromjson(columns_db) if columns_db is not none else none %} 15 | {% set columns_dict = re_data.dict_from_list(column_list) %} 16 | 17 | {% do model.update({'columns_dict': columns_dict}) %} 18 | {% do model.update({'columns_compute_all': columns_dict is none}) %} 19 | 20 | {% set columns_query %} 21 | select * from {{ ref('re_data_columns') }} 22 | where name = '{{ model.name }}' and schema = '{{ model.schema }}' and database = '{{ model.database }}' 23 | {% endset %} 24 | {% set columns = run_query(columns_query) %} 25 | 26 | {% set columns_info = {} %} 27 | {% for col in columns %} 28 | {% set column_name = re_data.row_value(col, 'column_name') %} 29 | {% set data_type = re_data.get_column_type(col) %} 30 | {% do columns_info.update({column_name: { 'data_type': data_type }}) %} 31 | {% endfor %} 32 | 33 | {% do model.update({'columns_info': columns_info}) %} 34 | {% do model.update({'columns': columns}) %} 35 | 36 | {{ return(model) }} 37 | {% endmacro %} 38 | 39 | {% macro should_compute_metric(model, column_name) %} 40 | {{ return(model.columns_compute_all or model.columns_dict.get(column_name)) }} 41 | {% endmacro %} -------------------------------------------------------------------------------- /macros/samples/internal_model_template.sql: -------------------------------------------------------------------------------- 1 | {%- macro order_by_if_time_filter(time_filter) -%} 2 | {%- if time_filter is not none -%} 3 | order by {{ time_filter }} desc 4 | {%- endif -%} 5 | {%- endmacro -%} 6 | 7 | 8 | {% macro re_data_last_table_samples() %} 9 | {{ re_data.generate_depends(['re_data_selected', 're_data_monitored', 're_data_columns', 're_data_run_started_at', 're_data_last_table_samples_part']) }} 10 | 11 | {{ 12 | config( 13 | materialized='table', 14 | ) 15 | }} 16 | 17 | {% if var.has_var('re_data:store_table_samples') %} 18 | {% set store_samples = var('re_data:store_table_samples') %} 19 | {% endif %} 20 | {% if not re_data.in_compile() and store_samples is sameas true %} 21 | {%- set tables = run_query(re_data.get_tables()) %} 22 | 23 | {% set samples_list = [] %} 24 | {%- for sample_table in tables %} 25 | 26 | {% set model = get_model_config(sample_table) %} 27 | {% set columns_to_sample = [] %} 28 | {% for key, value in model.columns_info.items() | sort %} 29 | {% if value.data_type in ['numeric', 'text'] %} 30 | {% do columns_to_sample.append(key) %} 31 | {% endif %} 32 | {% endfor %} 33 | 34 | {% set samples_query %} 35 | select {{ print_list(columns_to_sample)}} from {{ model.table_name }} 36 | {{ order_by_if_time_filter(model.time_filter) }} 37 | limit 10 38 | {% endset %} 39 | 40 | {% set samples = re_data.agate_to_list(run_query(samples_query)) %} 41 | {% 
do samples_list.append({ 42 | 'table_name': model.model_name, 43 | 'sample_data': samples, 44 | }) %} 45 | 46 | {% endfor %} 47 | {% do re_data.insert_list_to_table( 48 | ref('re_data_last_table_samples_part'), 49 | samples_list, 50 | ['table_name', 'sample_data'] 51 | ) %} 52 | {% endif %} 53 | 54 | {{ re_data.empty_last_table_samples() }} 55 | 56 | {% endmacro %} -------------------------------------------------------------------------------- /profiles.yml: -------------------------------------------------------------------------------- 1 | re_data_postgres: 2 | target: dev 3 | outputs: 4 | dev: 5 | type: postgres 6 | host: localhost 7 | user: postgres 8 | password: postgres 9 | port: 5432 10 | dbname: postgres 11 | schema: dq 12 | threads: 4 13 | re_data_snowflake: 14 | target: dev 15 | outputs: 16 | dev: 17 | type: snowflake 18 | account: "{{ env_var('SNOWFLAKE_RE_DATA_TESTING_ACCOUNT') }}" 19 | user: "{{ env_var('RE_DATA_TESTING_USER') }}" 20 | password: "{{ env_var('RE_DATA_TESTING_PASSWORD') }}" 21 | database: RE_DATA_TESTING 22 | warehouse: RE_DATA_TESTING_DWH 23 | schema: "{{ env_var('DQ_SCHEMA') }}" 24 | threads: 4 25 | re_data_redshift: 26 | target: dev 27 | outputs: 28 | dev: 29 | type: redshift 30 | host: "{{ env_var('REDSHIFT_RE_DATA_TESTING_HOST') }}" 31 | user: "{{ env_var('RE_DATA_TESTING_USER') }}" 32 | password: "{{ env_var('RE_DATA_TESTING_PASSWORD') }}" 33 | port: 5439 34 | dbname: re_data_testing 35 | schema: "{{ env_var('DQ_SCHEMA') }}" 36 | threads: 4 37 | re_data_bigquery: 38 | target: dev 39 | outputs: 40 | dev: 41 | type: bigquery 42 | method: service-account-json 43 | project: "{{ env_var('BIGQUERY_TESTING_PROJECT_ID') }}" 44 | dataset: "{{ env_var('DQ_SCHEMA') }}" 45 | threads: 4 46 | keyfile_json: 47 | type: "{{ env_var('BIGQUERY_TESTING_TYPE') }}" 48 | project_id: "{{ env_var('BIGQUERY_TESTING_PROJECT_ID') }}" 49 | private_key_id: "{{ env_var('BIGQUERY_TESTING_PRIVATE_KEY_ID') }}" 50 | private_key: "{{ env_var('BIGQUERY_TESTING_PRIVATE_KEY') }}" 51 | client_email: "{{ env_var('BIGQUERY_TESTING_CLIENT_EMAIL') }}" 52 | client_id: "{{ env_var('BIGQUERY_TESTING_CLIENT_ID') }}" 53 | auth_uri: "{{ env_var('BIGQUERY_TESTING_AUTH_URI') }}" 54 | token_uri: "{{ env_var('BIGQUERY_TESTING_TOKEN_URI') }}" 55 | auth_provider_x509_cert_url: "{{ env_var('BIGQUERY_TESTING_AUTH_PROVIDER_X509_CERT_URL') }}" 56 | client_x509_cert_url: "{{ env_var('BIGQUERY_TESTING_CLIENT_X509_CERT_URL') }}" 57 | location: US 58 | timeout_seconds: 300 59 | priority: interactive 60 | retries: 1 -------------------------------------------------------------------------------- /models/metrics/types/base/re_data_base_metrics.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key = 'id', 5 | on_schema_change='sync_all_columns', 6 | ) 7 | }} 8 | 9 | -- depends_on: {{ ref('re_data_columns') }} 10 | -- depends_on: {{ ref('re_data_last_base_metrics_thread0') }} 11 | -- depends_on: {{ ref('re_data_last_base_metrics_thread1') }} 12 | -- depends_on: {{ ref('re_data_last_base_metrics_thread2') }} 13 | -- depends_on: {{ ref('re_data_last_base_metrics_thread3') }} 14 | -- depends_on: {{ ref('re_data_last_base_metrics_part0') }} 15 | -- depends_on: {{ ref('re_data_last_base_metrics_part1') }} 16 | -- depends_on: {{ ref('re_data_last_base_metrics_part2') }} 17 | -- depends_on: {{ ref('re_data_last_base_metrics_part3') }} 18 | -- depends_on: {{ ref('re_data_run_started_at') }} 19 | -- depends_on: {{ 
ref('re_data_monitored') }} 20 | -- depends_on: {{ ref('re_data_selected') }} 21 | 22 | with 23 | 24 | with_time_window as ( 25 | {% set parts = ['0','1','2','3'] %} 26 | {% for part in parts %} 27 | {% set ref_name = 're_data_last_base_metrics_part' + part %} 28 | select 29 | *, 30 | {{ time_window_start() }} as time_window_start, 31 | {{ time_window_end() }} as time_window_end 32 | from {{ ref(ref_name) }} 33 | {%- if not loop.last %} union all {%- endif %} 34 | {% endfor %} 35 | ) 36 | select 37 | cast ({{ dbt_utils.generate_surrogate_key([ 38 | 'table_name', 39 | 'column_name', 40 | 'metric', 41 | 'time_window_start', 42 | 'time_window_end' 43 | ]) }} as {{ string_type() }} ) as id, 44 | cast (table_name as {{ string_type() }} ) as table_name, 45 | cast (column_name as {{ string_type() }} ) as column_name, 46 | cast (metric as {{ string_type() }} ) as metric, 47 | cast (value as {{ numeric_type() }} ) as value, 48 | cast (time_window_start as {{ timestamp_type() }} ) as time_window_start, 49 | cast (time_window_end as {{ timestamp_type() }} ) as time_window_end, 50 | cast ( 51 | {{ interval_length_sec('time_window_start', 'time_window_end') }} as {{ integer_type() }} 52 | ) as interval_length_sec, 53 | cast ({{dbt.current_timestamp_backcompat()}} as {{ timestamp_type() }}) as computed_on 54 | from with_time_window 55 | -------------------------------------------------------------------------------- /integration_tests/models/public_macros/normalizing/us_states_normalized.sql: -------------------------------------------------------------------------------- 1 | with us_states_normalization_cte as ( 2 | select source, target from {{ ref('us_states_normalization') }} 3 | ) 4 | 5 | {% set us_states_mapping = {'Ala.': 'Alabama', 'Alaska': 'Alaska', 'Ariz.': 'Arizona', 'Ark.': 'Arkansas', 'Calif.': 'California', 'Colo.': 'Colorado', 'Conn.': 'Connecticut', 6 | 'Del.': 'Delaware', 'D.C.': 'District of Columbia', 'Fla.': 'Florida', 'Ga.': 'Georgia', 'Hawaii': 'Hawaii', 'Idaho': 'Idaho', 'Ill.': 'Illinois', 'Ind.': 'Indiana', 7 | 'Iowa': 'Iowa', 'Kans.': 'Kansas', 'Ky.': 'Kentucky', 'La.': 'Louisiana', 'Maine': 'Maine', 'Md.': 'Maryland', 'Mass.': 'Massachusetts', 'Mich.': 'Michigan', 8 | 'Minn.': 'Minnesota', 'Miss.': 'Mississippi', 'Mo.': 'Missouri', 'Mont.': 'Montana', 'Nebr.': 'Nebraska', 'Nev.': 'Nevada', 'N.H.': 'New Hampshire', 'N.J.': 'New Jersey', 9 | 'N.M.': 'New Mexico', 'N.Y.': 'New York', 'N.C.': 'North Carolina', 'N.D.': 'North Dakota', 'Ohio': 'Ohio', 'Okla.': 'Oklahoma', 'Ore.': 'Oregon', 'Pa.': 'Pennsylvania', 10 | 'R.I.': 'Rhode Island', 'S.C.': 'South Carolina', 'S.D.': 'South Dakota', 'Tenn.': 'Tennessee', 'Tex.': 'Texas', 'Utah': 'Utah', 'Vt.': 'Vermont', 'Va.': 'Virginia', 11 | 'Wash.': 'Washington', 'W.Va.': 'West Virginia', 'Wis.': 'Wisconsin', 'Wyo.': 'Wyoming'} 12 | %} 13 | 14 | 15 | {# 16 | We have three ways of passing the source used for normalization 17 | 1. passing a dbt model using ref('') which is of type Relation. 18 | 2. passing a common table expression that contains the source mapping 19 | Note: model or cte must include "source" and "target" column names used for normalization in 1. & 2. respectively 20 | 3. 
passing a dictionary of values that map from source -> target ie {[source]: [target]} 21 | #} 22 | 23 | select distinct * from ( 24 | select state, code, state__normalized from {{ re_data.normalize_values(ref('abbreviated_us_states'), 'state', ref('us_states_normalization')) }} s 25 | union all 26 | select state, code, state__normalized from {{ re_data.normalize_values(ref('abbreviated_us_states'), 'state', 'us_states_normalization_cte') }} s 27 | union all 28 | select state, code, state__normalized from {{ re_data.normalize_values(ref('abbreviated_us_states'), 'state', us_states_mapping) }} s 29 | ) as normalized -------------------------------------------------------------------------------- /macros/utils/used_types.sql: -------------------------------------------------------------------------------- 1 | {% macro timestamp_type() %} 2 | {{ adapter.dispatch('timestamp_type', 're_data')() }} 3 | {% endmacro %} 4 | 5 | {% macro default__timestamp_type() %} 6 | timestamp without time zone 7 | {% endmacro %} 8 | 9 | {% macro redshift__timestamp_type() %} 10 | TIMESTAMP 11 | {% endmacro %} 12 | 13 | {% macro bigquery__timestamp_type() %} 14 | TIMESTAMP 15 | {% endmacro %} 16 | 17 | {% macro snowflake__timestamp_type() %} 18 | TIMESTAMP_NTZ 19 | {% endmacro %} 20 | 21 | {% macro string_type() %} 22 | {{ adapter.dispatch('string_type', 're_data')() }} 23 | {% endmacro %} 24 | 25 | {% macro default__string_type() %} 26 | text 27 | {% endmacro %} 28 | 29 | {% macro redshift__string_type() %} 30 | varchar(2047) 31 | {% endmacro %} 32 | 33 | {% macro bigquery__string_type() %} 34 | STRING 35 | {% endmacro %} 36 | 37 | {% macro snowflake__string_type() %} 38 | STRING 39 | {% endmacro %} 40 | 41 | {% macro long_string_type() %} 42 | {{ adapter.dispatch('long_string_type', 're_data')() }} 43 | {% endmacro %} 44 | 45 | {% macro default__long_string_type() %} 46 | {{ re_data.string_type() }} 47 | {% endmacro %} 48 | 49 | {% macro redshift__long_string_type() %} 50 | varchar(65535) 51 | {% endmacro %} 52 | 53 | {% macro integer_type() %} 54 | INTEGER 55 | {% endmacro %} 56 | 57 | 58 | {% macro boolean_type() %} 59 | {{ adapter.dispatch('boolean_type', 're_data')() }} 60 | {% endmacro %} 61 | 62 | {% macro default__boolean_type() %} 63 | BOOLEAN 64 | {% endmacro %} 65 | 66 | {% macro redshift__boolean_type() %} 67 | boolean 68 | {% endmacro %} 69 | 70 | {% macro bigquery__boolean_type() %} 71 | BOOLEAN 72 | {% endmacro %} 73 | 74 | {% macro snowflake__boolean_type() %} 75 | BOOLEAN 76 | {% endmacro %} 77 | 78 | 79 | {% macro numeric_type() %} 80 | {{ adapter.dispatch('numeric_type', 're_data')() }} 81 | {% endmacro %} 82 | 83 | {% macro default__numeric_type() %} 84 | double precision 85 | {% endmacro %} 86 | 87 | {% macro redshift__numeric_type() %} 88 | DOUBLE PRECISION 89 | {% endmacro %} 90 | 91 | {% macro bigquery__numeric_type() %} 92 | FLOAT64 93 | {% endmacro %} 94 | 95 | {% macro snowflake__numeric_type() %} 96 | FLOAT 97 | {% endmacro %} 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /macros/utils/column_types.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro get_column_type(column) %} 3 | {% set result = adapter.dispatch('get_column_type', 're_data')(column) %} 4 | {{ return(result) }} 5 | {% endmacro %} 6 | 7 | 8 | {% macro default__get_column_type(column) %} 9 | 10 | {% if column.data_type in [ 11 | 'character varying', 12 | 'varchar', 13 | 'character', 14 | 'char', 15 | 'text' 16 | ] 
%} 17 | {{ return('text') }} 18 | 19 | {% elif column.data_type in [ 20 | 'smallint', 21 | 'integer', 22 | 'bigint', 23 | 'decimal', 24 | 'numeric', 25 | 'real', 26 | 'double precision', 27 | 'enum', 28 | ] %} 29 | {{ return('numeric') }} 30 | 31 | {% elif column.data_type in [ 'boolean', 'bool' ] %} 32 | 33 | {{ return('boolean') }} 34 | 35 | {% else %} 36 | {{ return('unknown') }} 37 | 38 | {% endif %} 39 | 40 | {% endmacro %} 41 | 42 | 43 | {% macro snowflake__get_column_type(column) %} 44 | 45 | {% if column.DATA_TYPE in [ 46 | 'VARCHAR', 47 | 'CHAR', 48 | 'CHARACTER', 49 | 'STRING', 50 | 'TEXT' 51 | ] %} 52 | 53 | {{ return('text') }} 54 | 55 | {% elif column.DATA_TYPE in [ 56 | 'NUMBER', 57 | 'DECIMAL', 58 | 'NUMERIC', 59 | 'INT', 60 | 'INTEGER', 61 | 'BIGINT', 62 | 'SMALLINT', 63 | 'TINYINT', 64 | 'BYTEINT', 65 | 'FLOAT', 66 | 'FLOAT4', 67 | 'FLOAT8', 68 | 'DOUBLE', 69 | 'DOUBLE PRECISION', 70 | 'REAL', 71 | ] %} 72 | 73 | {{ return('numeric') }} 74 | 75 | {% elif column.DATA_TYPE in [ 'BOOLEAN' ] %} 76 | 77 | {{ return('boolean') }} 78 | 79 | {% else %} 80 | 81 | {{ return('unknown') }} 82 | 83 | {% endif %} 84 | 85 | {% endmacro %} 86 | 87 | 88 | {% macro bigquery__get_column_type(column) %} 89 | 90 | {% if column.data_type in [ 'STRING' ] %} 91 | {{ return('text') }} 92 | 93 | {% elif column.data_type in [ "INT64", "NUMERIC", "BIGNUMERIC", "FLOAT64", "INTEGER"] %} 94 | {{ return('numeric') }} 95 | 96 | {% elif column.data_type in [ "BOOLEAN", "BOOL"] %} 97 | {{ return('boolean') }} 98 | {% else %} 99 | {{ return('unknown') }} 100 | 101 | {% endif %} 102 | {% endmacro %} 103 | -------------------------------------------------------------------------------- /macros/metrics/base/build_in/column_default.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro re_data_metric_max(context) %} 3 | max({{context.column_name}}) 4 | {% endmacro %} 5 | 6 | {% macro re_data_metric_min(context) %} 7 | min({{context.column_name}}) 8 | {% endmacro %} 9 | 10 | {% macro re_data_metric_avg(context) %} 11 | avg(cast ({{context.column_name}} as {{ numeric_type() }})) 12 | {% endmacro %} 13 | 14 | {% macro re_data_metric_stddev(context) %} 15 | stddev(cast ( {{context.column_name}} as {{ numeric_type() }})) 16 | {% endmacro %} 17 | 18 | {% macro re_data_metric_variance(context) %} 19 | variance(cast ( {{context.column_name}} as {{ numeric_type() }})) 20 | {% endmacro %} 21 | 22 | {% macro re_data_metric_max_length(context) %} 23 | max(length({{context.column_name}})) 24 | {% endmacro %} 25 | 26 | {% macro re_data_metric_min_length(context) %} 27 | min(length({{context.column_name}})) 28 | {% endmacro %} 29 | 30 | {% macro re_data_metric_avg_length(context) %} 31 | avg(cast (length( {{context.column_name}} ) as {{ numeric_type() }})) 32 | {% endmacro %} 33 | 34 | {% macro re_data_metric_nulls_count(context) %} 35 | coalesce( 36 | sum( 37 | case when {{context.column_name}} is null 38 | then 1 39 | else 0 40 | end 41 | ), 0 42 | ) 43 | {% endmacro %} 44 | 45 | {% macro re_data_metric_missing_count(context) %} 46 | coalesce( 47 | sum( 48 | case 49 | when {{context.column_name}} is null 50 | then 1 51 | when {{context.column_name}} = '' 52 | then 1 53 | else 0 54 | end 55 | ), 0 56 | ) 57 | {% endmacro %} 58 | 59 | {% macro re_data_metric_nulls_percent(context) %} 60 | {{ percentage_formula(re_data_metric_nulls_count(context), re_data_metric_row_count()) }} 61 | {% endmacro %} 62 | 63 | {% macro re_data_metric_missing_percent(context) %} 64 | {{ 
percentage_formula(re_data_metric_missing_count(context), re_data_metric_row_count()) }} 65 | {% endmacro %} 66 | 67 | {% macro re_data_metric_count_true(context) %} 68 | COALESCE( 69 | SUM( 70 | CASE 71 | WHEN {{ context.column_name }} IS TRUE THEN 1 72 | ELSE 0 73 | END 74 | ), 75 | 0 76 | ) 77 | {% endmacro %} 78 | 79 | {% macro re_data_metric_count_false(context) %} 80 | COALESCE( 81 | SUM( 82 | CASE 83 | WHEN {{ context.column_name }} IS FALSE THEN 1 84 | ELSE 0 85 | END 86 | ), 87 | 0 88 | ) 89 | {% endmacro %} 90 | 91 | 92 | -------------------------------------------------------------------------------- /integration_tests/seeds/monitoring/expected_anomalies.csv: -------------------------------------------------------------------------------- 1 | table_name,column_name,metric,anomaly_detector,interval_length_sec 2 | BUY_EVENTS,VALUE2,min,"{""name"": ""z_score"", ""threshold"": 0.5}",86400 3 | BUY_EVENTS,VALUE2,avg,"{""name"": ""z_score"", ""threshold"": 0.5}",86400 4 | BUY_EVENTS,---,freshness,"{""name"": ""z_score"", ""threshold"": 0.5}",86400 5 | BUY_EVENTS,VALUE1,min,"{""name"": ""z_score"", ""threshold"": 0.5}",86400 6 | BUY_EVENTS,VALUE1,max,"{""name"": ""z_score"", ""threshold"": 0.5}",86400 7 | BUY_EVENTS,VALUE1,avg,"{""name"": ""z_score"", ""threshold"": 0.5}",86400 8 | BUY_EVENTS,VALUE2,max,"{""name"": ""z_score"", ""threshold"": 0.5}",86400 9 | SAMPLE_TABLE,VALUE1,avg,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 10 | SAMPLE_TABLE,EVENT_TYPE,unique_rows,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 11 | SAMPLE_TABLE,EVENT_TYPE,duplicate_rows,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 12 | SAMPLE_TABLE,VALUE1,max,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 13 | SAMPLE_TABLE,EVENT_TYPE,match_regex_percent,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 14 | SAMPLE_TABLE,VALUE1,diff,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 15 | SAMPLE_TABLE,VALUE1,stddev,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 16 | SAMPLE_TABLE,EVENT_TYPE,not_match_regex,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 17 | SAMPLE_TABLE,EVENT_TYPE,avg_length,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 18 | SAMPLE_TABLE,EVENT_TYPE,not_match_regex_percent,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 19 | SAMPLE_TABLE,EVENT_TYPE,match_regex,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 20 | SAMPLE_TABLE,VALUE1,variance,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 21 | SAMPLE_TABLE,EVENT_TYPE,max_length,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 22 | SAMPLE_TABLE,EVENT_TYPE,distinct_values,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 23 | SAMPLE_WITH_ANOMALY,---,freshness,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 24 | SAMPLE_WITH_ANOMALY,VALUE2,avg,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 25 | SAMPLE_WITH_ANOMALY,VALUE1,max,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 26 | SAMPLE_WITH_ANOMALY,VALUE2,min,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 27 | SAMPLE_WITH_ANOMALY,VALUE1,avg,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 28 | SAMPLE_WITH_ANOMALY,VALUE2,max,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 29 | SAMPLE_WITH_ANOMALY,VALUE1,min,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400 30 | -------------------------------------------------------------------------------- 
/macros/utils/generate_alert_message.sql: -------------------------------------------------------------------------------- 1 | {% macro generate_anomaly_message(column_name, metric, last_value, last_avg) -%} 2 | {{ return(adapter.dispatch('generate_anomaly_message', 're_data')(column_name, metric, last_value, last_avg)) }} 3 | {%- endmacro %} 4 | 5 | {% macro default__generate_anomaly_message(column_name, metric, last_value, last_avg) %} 6 | 7 | case when {{ column_name }} != '' then metric || '(' || column_name || ')' 8 | else metric 9 | end 10 | || ' is ' || 11 | {{ to_2dp( percentage_formula('last_value - last_avg', last_avg) ) }} 12 | || '% ' || 13 | {{ comparison_text(last_value, last_avg) }} 14 | || ' average.' 15 | {% endmacro %} 16 | 17 | {% macro to_2dp(val) %} 18 | {{ adapter.dispatch('to_2dp', 're_data')(val) }} 19 | {% endmacro %} 20 | 21 | {% macro default__to_2dp(val) %} 22 | trim(to_char({{ val }}, '9999999999999999990D00')) 23 | {% endmacro %} 24 | 25 | {% macro bigquery__to_2dp(val) %} 26 | format('%.2f', {{ val }}) 27 | {% endmacro %} 28 | 29 | {% macro seconds_to_hours(val) %} 30 | cast({{ val }} as {{ numeric_type() }}) / 3600 31 | {% endmacro %} 32 | 33 | {% macro generate_metric_value_text(metric, value) %} 34 | case 35 | when {{ metric }} = 'freshness' 36 | then cast({{ to_2dp(seconds_to_hours(value)) }} as {{ string_type() }}) || ' hours' 37 | when {{ regex_match_expression(metric, 'percent') }} 38 | then cast({{ to_2dp(value) }} as {{ string_type() }}) || '%' 39 | when {{ regex_match_expression(metric, 'count') }} 40 | then cast({{ value }} as {{ string_type() }}) 41 | else cast({{ to_2dp(value) }} as {{ string_type() }}) 42 | end 43 | 44 | {% endmacro %} 45 | 46 | {% macro generate_schema_change_message(operation, column_name, prev_column_name, prev_data_type, data_type, detected_time) %} 47 | case 48 | when {{ operation }} = 'column_added' 49 | then 'column ' || {{ column_name }} || ' of type ' || {{ data_type }} || ' was added.' 50 | when {{ operation }} = 'column_removed' 51 | then 'column ' || {{ prev_column_name }} || ' of type ' || {{ prev_data_type }} || ' was removed.' 52 | when {{ operation }} = 'type_change' 53 | then {{ column_name }} || ' column data type was changed from ' || {{ prev_data_type }} || ' to ' || {{ data_type }} || '.' 54 | else '' 55 | end 56 | {% endmacro %} 57 | 58 | {% macro generate_failed_test_message(test_name, column_name) %} 59 | case 60 | when {{ column_name }} is null 61 | then 'Test ' || {{ test_name }} || ' failed.' 62 | else 63 | 'Test ' || {{ test_name }} || ' failed for column ' || {{ column_name }} || '.' 
64 | end 65 | {% endmacro %} 66 | -------------------------------------------------------------------------------- /models/alerts/re_data_z_score.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key = 'id', 5 | on_schema_change='sync_all_columns', 6 | ) 7 | }} 8 | 9 | with z_score_without_id as ( 10 | 11 | select 12 | stats.table_name as table_name, 13 | stats.column_name as column_name, 14 | stats.metric as metric, 15 | stats.interval_length_sec, 16 | (last_metric.last_value - stats.last_avg) / (stats.last_stddev + 0.0000000001) as z_score_value, 17 | case 18 | when stats.last_median_absolute_deviation = 0 then 19 | (last_metric.last_value - stats.last_median) / (1.253314 * (stats.last_mean_absolute_deviation + 0.0000000001)) 20 | else 21 | (0.6745 * (last_metric.last_value - stats.last_median)) / (stats.last_median_absolute_deviation + 0.0000000001) 22 | end as modified_z_score_value, 23 | last_metric.last_value as last_value, 24 | stats.last_avg as last_avg, 25 | stats.last_median as last_median, 26 | stats.last_stddev as last_stddev, 27 | stats.last_median_absolute_deviation, 28 | stats.last_mean_absolute_deviation, 29 | stats.last_third_quartile - stats.last_first_quartile as last_iqr, 30 | stats.last_first_quartile, 31 | stats.last_third_quartile, 32 | {{ time_window_end() }} as time_window_end, 33 | cast( {{dbt.current_timestamp_backcompat()}} as {{ timestamp_type() }} ) as computed_on 34 | from 35 | {{ ref('re_data_last_stats') }} as stats, 36 | {{ ref('re_data_last_metrics') }} as last_metric 37 | where 38 | stats.table_name = last_metric.table_name and 39 | stats.column_name = last_metric.column_name and 40 | stats.metric = last_metric.metric and 41 | ( 42 | stats.interval_length_sec = last_metric.interval_length_sec or 43 | (stats.interval_length_sec is null and last_metric.interval_length_sec is null) 44 | ) and 45 | last_metric.last_value is not null and 46 | stats.last_avg is not null and 47 | stats.last_stddev is not null 48 | ) 49 | 50 | select 51 | cast ({{ dbt_utils.generate_surrogate_key([ 52 | 'table_name', 53 | 'column_name', 54 | 'metric', 55 | 'interval_length_sec', 56 | 'time_window_end' 57 | ]) }} as {{ string_type() }} ) as id, 58 | table_name, 59 | column_name, 60 | metric, 61 | z_score_value, 62 | modified_z_score_value, 63 | last_value, 64 | last_avg, 65 | last_median, 66 | last_stddev, 67 | last_median_absolute_deviation, 68 | last_mean_absolute_deviation, 69 | last_iqr, 70 | last_first_quartile, 71 | last_third_quartile, 72 | time_window_end, 73 | interval_length_sec, 74 | computed_on 75 | 76 | from z_score_without_id 77 | -------------------------------------------------------------------------------- /models/alerts/re_data_anomalies.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='view' 4 | ) 5 | }} 6 | select 7 | z.id, 8 | z.table_name, 9 | z.column_name, 10 | z.metric, 11 | z.z_score_value, 12 | z.modified_z_score_value, 13 | m.anomaly_detector, 14 | z.last_value, 15 | z.last_avg, 16 | z.last_median, 17 | z.last_stddev, 18 | z.last_median_absolute_deviation, 19 | z.last_mean_absolute_deviation, 20 | z.last_iqr, 21 | z.last_first_quartile - (cast( {{ json_extract('m.anomaly_detector', 'whisker_boundary_multiplier') }} as {{numeric_type()}} ) * z.last_iqr) lower_bound, 22 | z.last_third_quartile + (cast( {{ json_extract('m.anomaly_detector', 'whisker_boundary_multiplier') }} as 
{{numeric_type()}} ) * z.last_iqr) upper_bound, 23 | z.last_first_quartile, 24 | z.last_third_quartile, 25 | z.time_window_end, 26 | z.interval_length_sec, 27 | z.computed_on, 28 | {{ re_data.generate_anomaly_message('z.column_name', 'z.metric', 'z.last_value', 'z.last_avg') }} as message, 29 | {{ re_data.generate_metric_value_text('z.metric', 'z.last_value') }} as last_value_text 30 | from 31 | {{ ref('re_data_z_score')}} z 32 | left join {{ ref('re_data_selected') }} m 33 | on {{ split_and_return_nth_value('table_name', '.', 1) }} = m.database 34 | and {{ split_and_return_nth_value('table_name', '.', 2) }} = m.schema 35 | and {{ split_and_return_nth_value('table_name', '.', 3) }} = m.name 36 | where 37 | case when (lower(coalesce({{ json_extract('m.anomaly_detector', 'direction') }}, 'both')) = 'up' and z.last_value > z.last_avg) 38 | or (lower(coalesce({{ json_extract('m.anomaly_detector', 'direction') }}, 'both')) = 'down' and z.last_value < z.last_avg) 39 | or (lower(coalesce({{ json_extract('m.anomaly_detector', 'direction') }}, 'both')) != 'up' and lower(coalesce({{ json_extract('m.anomaly_detector', 'direction') }}, 'both')) != 'down') 40 | then 41 | case 42 | when {{ json_extract('m.anomaly_detector', 'name') }} = 'z_score' 43 | then abs(z_score_value) > cast({{ json_extract('m.anomaly_detector', 'threshold') }} as {{ numeric_type() }}) 44 | when {{ json_extract('m.anomaly_detector', 'name') }} = 'modified_z_score' 45 | then abs(modified_z_score_value) > cast( {{ json_extract('m.anomaly_detector', 'threshold') }} as {{numeric_type()}} ) 46 | when {{ json_extract('m.anomaly_detector', 'name') }} = 'boxplot' 47 | then ( 48 | z.last_value < z.last_first_quartile - (cast( {{ json_extract('m.anomaly_detector', 'whisker_boundary_multiplier') }} as {{numeric_type()}} ) * z.last_iqr) 49 | or 50 | z.last_value > z.last_third_quartile + (cast( {{ json_extract('m.anomaly_detector', 'whisker_boundary_multiplier') }} as {{numeric_type()}} ) * z.last_iqr) 51 | ) 52 | else false 53 | end 54 | else false 55 | end 56 | -------------------------------------------------------------------------------- /macros/public/validating/valid_with_regex.sql: -------------------------------------------------------------------------------- 1 | 2 | 3 | {% macro valid_regex(column_name, to_validate) %} 4 | {% set pattern = re_data.get_regex_for(to_validate) %} 5 | case when 6 | {{ column_name }} is null then false 7 | else {{ re_data.regex_match_expression(column_name, pattern) }} 8 | end 9 | {% endmacro %} 10 | 11 | {% macro valid_email(column_name) %} 12 | {{ re_data.valid_regex(column_name, 'email')}} 13 | {% endmacro %} 14 | 15 | {% macro valid_date_eu(column_name) %} 16 | {{ re_data.valid_regex(column_name, 'date_eu')}} 17 | {% endmacro %} 18 | 19 | {% macro valid_date_us(column_name) %} 20 | {{ re_data.valid_regex(column_name, 'date_us')}} 21 | {% endmacro %} 22 | 23 | {% macro valid_date_inverse(column_name) %} 24 | {{ re_data.valid_regex(column_name, 'date_inverse')}} 25 | {% endmacro %} 26 | 27 | {% macro valid_date_iso_8601(column_name) %} 28 | {{ re_data.valid_regex(column_name, 'date_iso_8601')}} 29 | {% endmacro %} 30 | 31 | {% macro valid_time_24h(column_name) %} 32 | {{ re_data.valid_regex(column_name, 'time_24h')}} 33 | {% endmacro %} 34 | 35 | {% macro valid_time_12h(column_name) %} 36 | {{ re_data.valid_regex(column_name, 'time_12h')}} 37 | {% endmacro %} 38 | 39 | {% macro valid_time(column_name) %} 40 | {{ re_data.valid_regex(column_name, 'time')}} 41 | {% endmacro %} 42 | 43 | {% macro 
valid_ip_v4(column_name) %} 44 | {{ re_data.valid_regex(column_name, 'ipv4_address')}} 45 | {% endmacro %} 46 | 47 | {% macro valid_ip_v6(column_name) %} 48 | {{ re_data.valid_regex(column_name, 'ipv6_address')}} 49 | {% endmacro %} 50 | 51 | {% macro valid_ip(column_name) %} 52 | ( 53 | {{ re_data.valid_regex(column_name, 'ipv4_address')}} 54 | or 55 | {{ re_data.valid_regex(column_name, 'ipv6_address')}} 56 | ) 57 | {% endmacro %} 58 | 59 | {% macro valid_number(column_name) %} 60 | {{ re_data.valid_regex(column_name, 'number_whole')}} 61 | {% endmacro %} 62 | 63 | {% macro valid_number_decimal_point(column_name) %} 64 | {{ re_data.valid_regex(column_name, 'number_decimal_point')}} 65 | {% endmacro %} 66 | 67 | {% macro valid_number_decimal_comma(column_name) %} 68 | {{ re_data.valid_regex(column_name, 'number_decimal_comma')}} 69 | {% endmacro %} 70 | 71 | {% macro valid_number_percentage(column_name) %} 72 | {{ re_data.valid_regex(column_name, 'number_percentage')}} 73 | {% endmacro %} 74 | 75 | {% macro valid_number_percentage_point(column_name) %} 76 | {{ re_data.valid_regex(column_name, 'number_percentage_point')}} 77 | {% endmacro %} 78 | 79 | {% macro valid_number_percentage_comma(column_name) %} 80 | {{ re_data.valid_regex(column_name, 'number_percentage_comma')}} 81 | {% endmacro %} 82 | 83 | {% macro valid_phone(column_name) %} 84 | {{ re_data.valid_regex(column_name, 'phone')}} 85 | {% endmacro %} 86 | 87 | {% macro valid_uuid(column_name) %} 88 | {{ re_data.valid_regex(column_name, 'uuid')}} 89 | {% endmacro %} 90 | 91 | {% macro valid_credit_card(column_name) %} 92 | {{ re_data.valid_regex(column_name, 'credit_card_number')}} 93 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: "re_data_integration_tests" 2 | version: "1.0" 3 | config-version: 2 4 | 5 | # Changed when running tests 6 | profile: "re_data_postgres" 7 | 8 | target-path: "target" 9 | clean-targets: ["target", "dbt_modules", "dbt_packages"] 10 | 11 | models: 12 | re_data: 13 | enabled: true 14 | 15 | sources: 16 | re_data_integration_tests: 17 | +re_data_monitored: true 18 | +re_data_time_filter: null 19 | 20 | vars: 21 | re_data:store_table_samples: true 22 | re_data:anomaly_detector: 23 | name: modified_z_score 24 | threshold: 0.6 25 | re_data:max_columns_in_query: 1 26 | 27 | re_data:select: 28 | - tag:testtag 29 | - sample_table 30 | - sample_without_time_filter 31 | - sample_with_anomaly 32 | - re_data_source_test_table 33 | 34 | re_data:metrics_groups: 35 | integration_test_group: 36 | table: 37 | - row_count 38 | - freshness 39 | - my_distinct_table_rows 40 | 41 | column: 42 | numeric: 43 | - min 44 | - max 45 | - avg 46 | - stddev 47 | - variance 48 | - nulls_count 49 | - nulls_percent 50 | - diff # my own custom metric 51 | 52 | text: 53 | - min_length 54 | - max_length 55 | - avg_length 56 | - nulls_count 57 | - nulls_percent 58 | - missing_percent 59 | - missing_count 60 | 61 | re_data:default_metrics: 62 | - integration_test_group 63 | 64 | seeds: 65 | +schema: seeds 66 | +quote_columns: false 67 | 68 | re_data_integration_tests: 69 | monitoring: 70 | sample_with_anomaly: 71 | +re_data_monitored: true 72 | +re_data_time_filter: creation_time 73 | 74 | sample_without_time_filter: 75 | +re_data_monitored: true 76 | +re_data_time_filter: null 77 | 78 | sample_table: 79 | +re_data_monitored: true 80 | +re_data_time_filter: 
creation_time 81 | 82 | +re_data_columns: 83 | - event_type 84 | - value1 85 | - value2 86 | - null_value 87 | 88 | +re_data_metrics: 89 | table: 90 | - my_custom_table_metric # my own custom metric 91 | - distinct_table_rows 92 | column: 93 | event_type: 94 | - regex_test: 95 | regex: ([A-Za-z0-9]+) 96 | - match_regex: 97 | regex: ^sell 98 | - match_regex_percent: 99 | regex: ^sell 100 | - not_match_regex: 101 | regex: ^buy 102 | - not_match_regex_percent: 103 | regex: ^buy 104 | - distinct_values 105 | - duplicate_values 106 | - duplicate_rows 107 | - unique_rows 108 | 109 | expected_z_score: 110 | +column_types: 111 | time_window_end: "TIMESTAMP" 112 | 113 | expected_metrics: 114 | +column_types: 115 | time_window_start: "TIMESTAMP" 116 | time_window_end: "TIMESTAMP" 117 | -------------------------------------------------------------------------------- /macros/metrics/base/queries.sql: -------------------------------------------------------------------------------- 1 | {% macro metrics_base_compute_for_thread(thread_value, ref_model) %} 2 | {%- set tables = run_query(re_data.get_tables()) %} 3 | {%- for mtable in tables %} 4 | -- we split metric computation across 4 different threads 5 | {% set for_loop_mod = (loop.index % 4) %} 6 | {% if for_loop_mod == thread_value %} 7 | {% set model = get_model_config(mtable) %} 8 | 9 | {% set columns_to_query = [] %} 10 | {% set size = 0 %} 11 | 12 | {% for column in model.columns %} 13 | {% set column_name = re_data.row_value(column, 'column_name') %} 14 | 15 | {% if should_compute_metric(model, column_name) %} 16 | {% do columns_to_query.append(column) %} 17 | {% endif %} 18 | 19 | {% set columns_size = columns_to_query| length %} 20 | 21 | {% if columns_size == var('re_data:max_columns_in_query') %} 22 | {%- set insert_stats_query = re_data.metrics_base_insert(model, ref_model, columns_to_query) -%} 23 | 24 | {% if insert_stats_query %} 25 | {% do run_query(insert_stats_query) %} 26 | {% endif %} 27 | {% do columns_to_query.clear() %} 28 | {% endif %} 29 | {% endfor %} 30 | 31 | {%- set insert_stats_query = re_data.metrics_base_insert(model, ref_model, columns_to_query, table_level=True) -%} 32 | {% do run_query(insert_stats_query) %} 33 | 34 | {{ dbt_utils.log_info('[re_data_log] - finished computing metrics for:' ~ model.model_name) }} 35 | {% endif %} 36 | {% endfor %} 37 | {% endmacro %} 38 | 39 | {% macro metrics_base_insert(model, ref_model, columns, table_level=False) %} 40 | 41 | {% set col_exprs = re_data.metrics_base_expressions(model, columns, table_level) %} 42 | {% if col_exprs == [] %} 43 | {{ return ('') }} 44 | {% endif %} 45 | 46 | insert into {{ ref(ref_model) }} 47 | with temp_table_metrics as ( 48 | select 49 | {%- for col_expr in col_exprs %} 50 | ( {{ col_expr.expr }} ) as {{ re_data.quote_column_name(col_expr.col_name + '___' + col_expr.metric) }} 51 | {%- if not loop.last %},{%- endif %} 52 | {% endfor %} 53 | from 54 | {{ model.table_name }} 55 | where 56 | {{ in_time_window(model.time_filter) }} 57 | ) 58 | 59 | {%- for col_expr in col_exprs %} 60 | {% set final_metric_name = get_final_metric_name(col_expr.metric, model.time_filter) %} 61 | 62 | select '{{model.table_name}}' as table_name, '{{ col_expr.col_name }}' as column_name, '{{ final_metric_name }}' as metric, {{ re_data.quote_column_name(col_expr.col_name + '___' + col_expr.metric) }} as value 63 | from temp_table_metrics 64 | {% if not loop.last %}union all{% endif %} 65 | {% endfor %} 66 | 67 | {% endmacro %} 68 | 69 | {% macro 
get_final_metric_name(metric_name, time_filter) %} 70 | {% if time_filter is none %} 71 | {{ return ('global__' + metric_name) }} 72 | {% else %} 73 | {{ return (metric_name) }} 74 | {% endif %} 75 | {% endmacro %} -------------------------------------------------------------------------------- /integration_tests/python_tests/test_monitoring.py: -------------------------------------------------------------------------------- 1 | import os 2 | import copy 3 | import yaml 4 | import json 5 | from datetime import datetime, timedelta 6 | from .utils.run import dbt_seed, dbt_run, dbt_test, dbt_command, dbt_build 7 | 8 | RUN_TIME = datetime(2021, 5, 2, 0, 0, 0) 9 | 10 | DBT_VARS = { 11 | 're_data:time_window_start': (RUN_TIME - timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S"), 12 | 're_data:time_window_end': RUN_TIME.strftime("%Y-%m-%d %H:%M:%S"), 13 | 're_data:save_test_history': True 14 | } 15 | 16 | def test_monitoring(db, source_schema): 17 | DBT_VARS.update({'source_schema': source_schema}) 18 | 19 | load_deps = 'dbt deps' 20 | assert os.system(load_deps) == 0 21 | 22 | dbt_vars = copy.deepcopy(DBT_VARS) 23 | 24 | print (f"Running setup and tests for {db}") 25 | 26 | dbt_seed('--select monitoring', db, dbt_vars) 27 | dbt_run('--models transformed', db, dbt_vars) 28 | dbt_command( 29 | f'dbt run-operation create_test_source_tables', 30 | db, dbt_vars 31 | ) 32 | 33 | print (f"Computing re_data metrics for {db}") 34 | dbt_run('--select package:re_data', db, dbt_vars) 35 | 36 | dbt_command( 37 | f'dbt run-operation schema_change_buy_events_add_column', 38 | db, dbt_vars 39 | ) 40 | 41 | # update dbt_vars to run dbt for the next day of data 42 | dbt_vars['re_data:time_window_start'] = dbt_vars['re_data:time_window_end'] 43 | dbt_vars['re_data:time_window_end'] = (RUN_TIME + timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S") 44 | 45 | dbt_command( 46 | 'dbt run --select package:re_data --fail-fast', 47 | db, dbt_vars 48 | ) 49 | 50 | dbt_command( 51 | 'dbt run --select monitoring.*', db, dbt_vars 52 | ) 53 | 54 | dbt_test('--select test_re_data_anomalies test_re_data_metrics test_re_data_z_score test_re_data_table_samples re_data_metrics transformed', db, dbt_vars) 55 | # dbt build will "duplicate" saved test result history 56 | dbt_build('--select test_re_data_anomalies test_re_data_metrics test_re_data_z_score test_re_data_table_samples re_data_metrics transformed', db, dbt_vars) 57 | 58 | # test test_history separately, because those are actually added to the DB after running 59 | # the dbt test command 60 | dbt_test('--select test_re_data_test_history', db, dbt_vars) 61 | 62 | op_vars = { 63 | 'start_date': RUN_TIME.strftime("%Y-%m-%d"), 64 | 'end_date': (RUN_TIME + timedelta(days=1)).strftime("%Y-%m-%d"), 65 | 'interval': 'days:1' 66 | } 67 | op_vars = yaml.dump(op_vars) 68 | 69 | dbt_command( 70 | f'dbt run-operation generate_overview --args "{op_vars}"', 71 | db, dbt_vars 72 | ) 73 | 74 | overview = json.load(open(f'../target/re_data/overview.json')) 75 | expected_types = ['metric', 'schema_change', 'schema', 'alert', 'anomaly'] 76 | all_types = set() 77 | 78 | # some simple checks for now 79 | for obj in overview: 80 | all_types.add(obj['type']) 81 | assert obj['table_name'] 82 | assert 'column_name' in obj 83 | assert 'computed_on' in obj 84 | 85 | assert len(overview) > 100 86 | assert sorted(all_types) == sorted(expected_types) 87 | 88 | print (f"Running tests completed for {db}") 89 | -------------------------------------------------------------------------------- /macros/utils/time_macros.sql: 
-------------------------------------------------------------------------------- 1 | 2 | {% macro time_window_start() %} 3 | cast('{{- var('re_data:time_window_start') -}}' as timestamp) 4 | {% endmacro %} 5 | 6 | 7 | {% macro time_window_end() %} 8 | cast('{{- var('re_data:time_window_end') -}}' as timestamp) 9 | {% endmacro %} 10 | 11 | 12 | {% macro anamaly_detection_time_window_start() %} 13 | {{ adapter.dispatch('anamaly_detection_time_window_start', 're_data')() }} 14 | {% endmacro %} 15 | 16 | {% macro default__anamaly_detection_time_window_start() %} 17 | {{ time_window_start() }} - interval '{{var('re_data:anomaly_detection_look_back_days')}} days' 18 | {% endmacro %} 19 | 20 | {% macro bigquery__anamaly_detection_time_window_start() %} 21 | DATE_ADD({{ time_window_start() }}, INTERVAL -{{var('re_data:anomaly_detection_look_back_days')}} DAY) 22 | {% endmacro %} 23 | 24 | {% macro snowflake__anamaly_detection_time_window_start() %} 25 | DATEADD('DAY', -{{-var('re_data:anomaly_detection_look_back_days')-}}, {{ time_window_start() }}) 26 | {% endmacro %} 27 | 28 | 29 | {% macro interval_length_sec(start_timestamp, end_timestamp) %} 30 | {{ adapter.dispatch('interval_length_sec', 're_data')(start_timestamp, end_timestamp) }} 31 | {% endmacro %} 32 | 33 | {% macro default__interval_length_sec(start_timestamp, end_timestamp) %} 34 | EXTRACT(EPOCH FROM ({{ end_timestamp }} - {{ start_timestamp }} )) 35 | {% endmacro %} 36 | 37 | {% macro bigquery__interval_length_sec(start_timestamp, end_timestamp) %} 38 | TIMESTAMP_DIFF ({{ end_timestamp }}, {{ start_timestamp }}, SECOND) 39 | {% endmacro %} 40 | 41 | {% macro snowflake__interval_length_sec(start_timestamp, end_timestamp) %} 42 | timediff(second, {{ start_timestamp }}, {{ end_timestamp }}) 43 | {% endmacro %} 44 | 45 | {% macro redshift__interval_length_sec(start_timestamp, end_timestamp) %} 46 | DATEDIFF(second, {{ start_timestamp }}, {{ end_timestamp }}) 47 | {% endmacro %} 48 | 49 | {%- macro in_time_window(time_column) %} 50 | {# /* If no time_filter is specified, we compute the metric over the entire table; otherwise we filter to the time frame */ #} 51 | {% if time_column is none %} 52 | true 53 | {% else %} 54 | {{ adapter.dispatch('in_time_window', 're_data')(time_column) }} 55 | {% endif %} 56 | {% endmacro -%} 57 | 58 | {% macro default__in_time_window(time_column) %} 59 | {{time_column}} >= {{ time_window_start() }} and 60 | {{time_column}} < {{ time_window_end() }} 61 | {% endmacro %} 62 | 63 | {% macro bigquery__in_time_window(time_column) %} 64 | cast({{time_column}} as timestamp) >= {{ time_window_start() }} and 65 | cast({{time_column}} as timestamp) < {{ time_window_end() }} 66 | {% endmacro %} 67 | 68 | 69 | {% macro format_timestamp(column_name) %} 70 | {{ adapter.dispatch('format_timestamp', 're_data')(column_name) }} 71 | {% endmacro %} 72 | 73 | {% macro default__format_timestamp(column_name) %} 74 | to_char({{column_name}}, 'YYYY-MM-DD HH24:MI:SS') 75 | {% endmacro %} 76 | 77 | {% macro bigquery__format_timestamp(column_name) %} 78 | FORMAT_TIMESTAMP('%Y-%m-%d %H:%M:%S', {{column_name}}) 79 | {% endmacro %} 80 | 81 | /* 82 | provide a common way to compare time vs a range: start_date <= target <= end_date 83 | if start_date is none: target <= end_date 84 | if end_date is none: target >= start_date 85 | think of none as infinity 86 | */ 87 | {%- macro in_date_window(target, start_date, end_date) %} 88 | {{ adapter.dispatch('in_date_window','re_data')(target, start_date, end_date) }} 89 | {% endmacro -%} 90 | 91 | {% 
macro default__in_date_window(target, start_date, end_date) %} 92 | {% if start_date is not none and end_date is not none %} 93 | date({{target}}) between '{{start_date}}' and '{{end_date}}' 94 | {% elif start_date is none %} 95 | date({{target}}) <= '{{end_date}}' 96 | {% elif end_date is none %} 97 | date({{target}}) >= '{{start_date}}' 98 | {% endif %} 99 | {% endmacro %} 100 | 101 | -------------------------------------------------------------------------------- /models/metrics/for_anomalies/re_data_last_stats.sql: -------------------------------------------------------------------------------- 1 | {% set columns_to_group_by = 'table_name, column_name, metric, interval_length_sec' %} 2 | 3 | with median_value as ( 4 | select distinct 5 | table_name, 6 | column_name, 7 | metric, 8 | interval_length_sec, 9 | avg(value) {% if target.type not in postgres_type_db() %} over(partition by {{ columns_to_group_by }}) {% endif %} as last_avg, 10 | {{ percentile(percentile_field='value', partition_field=columns_to_group_by, percent='0.25') }} as last_first_quartile, 11 | {{ percentile(percentile_field='value', partition_field=columns_to_group_by, percent='0.5') }} as last_median, 12 | {{ percentile(percentile_field='value', partition_field=columns_to_group_by, percent='0.75') }} as last_third_quartile 13 | from 14 | {{ ref('re_data_base_metrics') }} 15 | where 16 | time_window_end > {{- anamaly_detection_time_window_start() -}} and 17 | time_window_end <= {{- time_window_end() -}} 18 | {% if target.type in postgres_type_db() %} 19 | group by 20 | {{ columns_to_group_by }} 21 | {% endif %} 22 | 23 | ), abs_deviation as ( 24 | select 25 | s.table_name, 26 | s.column_name, 27 | s.metric, 28 | s.interval_length_sec, 29 | abs( s.value - mv.last_avg ) as absolute_deviation_from_mean, 30 | abs( s.value - mv.last_median ) as absolute_deviation_from_median 31 | from 32 | {{ ref('re_data_base_metrics') }} s 33 | left join 34 | median_value mv 35 | on 36 | s.table_name = mv.table_name and 37 | s.column_name = mv.column_name and 38 | s.metric = mv.metric and 39 | s.interval_length_sec = mv.interval_length_sec 40 | where 41 | s.time_window_end > {{- anamaly_detection_time_window_start() -}} and 42 | s.time_window_end <= {{- time_window_end() -}} 43 | ), median_abs_deviation as ( 44 | select distinct 45 | table_name, 46 | column_name, 47 | metric, 48 | interval_length_sec, 49 | avg(absolute_deviation_from_mean) {% if target.type not in postgres_type_db() %} over(partition by {{ columns_to_group_by }}) {% endif %} as mean_absolute_deviation, 50 | {{ percentile(percentile_field='absolute_deviation_from_median', partition_field=columns_to_group_by, percent='0.5') }} as median_absolute_deviation 51 | from 52 | abs_deviation 53 | {% if target.type in postgres_type_db() %} 54 | group by 55 | {{ columns_to_group_by }} 56 | {% endif %} 57 | ), stats as ( 58 | select 59 | table_name, 60 | column_name, 61 | metric, 62 | stddev(value) as last_stddev, 63 | max(time_window_end) as last_metric_time, 64 | interval_length_sec, 65 | max(computed_on) as computed_on 66 | from 67 | {{ ref('re_data_base_metrics') }} 68 | where 69 | time_window_end > {{- anamaly_detection_time_window_start() -}} and 70 | time_window_end <= {{- time_window_end() -}} 71 | group by 72 | {{ columns_to_group_by }} 73 | ) 74 | 75 | select 76 | s.table_name, 77 | s.column_name, 78 | s.metric, 79 | mv.last_avg, 80 | s.last_stddev, 81 | s.last_metric_time, 82 | s.interval_length_sec, 83 | s.computed_on, 84 | mv.last_median, 85 | mv.last_first_quartile, 86 | 
mv.last_third_quartile, 87 | md.median_absolute_deviation last_median_absolute_deviation, 88 | md.mean_absolute_deviation last_mean_absolute_deviation 89 | from 90 | stats s 91 | left join 92 | median_value mv 93 | on 94 | s.table_name = mv.table_name and 95 | s.column_name = mv.column_name and 96 | s.metric = mv.metric and 97 | s.interval_length_sec = mv.interval_length_sec 98 | left join 99 | median_abs_deviation md 100 | on 101 | s.table_name = md.table_name and 102 | s.column_name = md.column_name and 103 | s.metric = md.metric and 104 | s.interval_length_sec = md.interval_length_sec 105 | -------------------------------------------------------------------------------- /macros/metrics/base/build_in/optional_column_metrics.sql: -------------------------------------------------------------------------------- 1 | {% macro re_data_metric_regex_count(column_name, pattern) %} 2 | coalesce( 3 | sum( 4 | case when {{ regex_match_expression(column_name, pattern) }} 5 | then 1 6 | else 0 7 | end 8 | ), 0 9 | ) 10 | {% endmacro %} 11 | 12 | {% macro re_data_metric_match_regex(context) %} 13 | {{ re_data_metric_regex_count(context.column_name, context.config.regex) }} 14 | {% endmacro %} 15 | 16 | {% macro re_data_metric_match_regex_percent(context) %} 17 | {{ percentage_formula(re_data_metric_match_regex(context), re_data_metric_row_count()) }} 18 | {% endmacro %} 19 | 20 | {% macro re_data_metric_not_match_regex(context) %} 21 | {{ re_data_metric_row_count() }} - {{ re_data_metric_regex_count(context.column_name, context.config.regex) }} 22 | {% endmacro %} 23 | 24 | {% macro re_data_metric_not_match_regex_percent(context) %} 25 | {{ percentage_formula(re_data_metric_not_match_regex(context), re_data_metric_row_count()) }} 26 | {% endmacro %} 27 | 28 | {% macro re_data_metric_distinct_values(context) %} 29 | {{ distinct_values(context) }} 30 | {% endmacro %} 31 | 32 | {% macro distinct_values(context) %} 33 | {{ adapter.dispatch('distinct_values', 're_data')(context) }} 34 | {% endmacro %} 35 | 36 | {% macro default__distinct_values(context) %} 37 | coalesce( 38 | count(distinct {{ context.column_name }} ) 39 | , 0) 40 | {% endmacro %} 41 | 42 | {% macro postgres__distinct_values(context) %} 43 | {# /* In postgres, it's faster to count distinct values in a column by selecting then counting in separate steps */ #} 44 | with temp_table as ( 45 | select distinct {{ context.column_name }} from {{ context.table_name }} 46 | where {{ in_time_window(context.time_filter) }} 47 | ) 48 | select coalesce(count(*), 0) from temp_table 49 | {% endmacro %} 50 | 51 | {% macro re_data_metric_approx_distinct_values(context) %} 52 | {{ approx_distinct_values(context) }} 53 | {% endmacro %} 54 | 55 | {% macro approx_distinct_values(context) %} 56 | {{ adapter.dispatch('approx_distinct_values', 're_data')(context) }} 57 | {% endmacro %} 58 | 59 | {% macro default__approx_distinct_values(context) %} 60 | {# /* No approximate distinct count in postgres so we default to using a distinct count */ #} 61 | {{ re_data_metric_distinct_values(context) }} 62 | {% endmacro %} 63 | 64 | {% macro redshift__approx_distinct_values(context) %} 65 | {# /* Redshift requires APPROXIMATE to directly precede COUNT(DISTINCT ...), so inline the expression here */ #} approximate count(distinct {{ context.column_name }}) 66 | {% endmacro %} 67 | 68 | {% macro bigquery__approx_distinct_values(context) %} 69 | approx_count_distinct({{ context.column_name }}) 70 | {% endmacro %} 71 | 72 | {% macro snowflake__approx_distinct_values(context) %} 73 | approx_count_distinct({{ context.column_name }}) 74 | {% endmacro %} 75 | 76 | {% macro 
re_data_metric_duplicate_values(context) %} 77 | with temp_table as ( 78 | select {{ context.column_name }} from {{ context.table_name }} 79 | where {{ in_time_window(context.time_filter) }} 80 | group by {{ context.column_name }} 81 | having count(1) > 1 82 | ) 83 | select coalesce(count(*), 0) from temp_table 84 | {% endmacro %} 85 | 86 | {% macro re_data_metric_duplicate_rows(context) %} 87 | with temp_table as ( 88 | select {{ context.column_name }}, count(1) as row_count from {{ context.table_name }} 89 | where {{ in_time_window(context.time_filter) }} 90 | group by {{ context.column_name }} 91 | having count(1) > 1 92 | ) 93 | select coalesce(sum(row_count), 0) from temp_table 94 | {% endmacro %} 95 | 96 | {% macro re_data_metric_unique_rows(context) %} 97 | with temp_table as ( 98 | select {{ context.column_name }}, count(1) as row_count from {{ context.table_name }} 99 | where {{ in_time_window(context.time_filter) }} 100 | group by {{ context.column_name }} 101 | having count(1) = 1 102 | ) 103 | select coalesce(sum(row_count), 0) from temp_table 104 | {% endmacro %} -------------------------------------------------------------------------------- /macros/meta/get_monitored.sql: -------------------------------------------------------------------------------- 1 | {% macro pub_monitored_from_graph() %} 2 | {% set monitored = [] %} 3 | {% set both = []%} 4 | {% do both.extend(graph.nodes.values()) %} 5 | {% do both.extend(graph.sources.values()) %} 6 | {% set owners_config = re_data.get_owners_config() %} 7 | 8 | {% set select_var = var('re_data:select') %} 9 | {% set select_all = true %} 10 | 11 | {% set selected_nodes = none %} 12 | {% set selected_tags = none %} 13 | 14 | {% if select_var is not none %} 15 | {% set select_all = false %} 16 | {% set selected_nodes = dict() %} 17 | {% set selected_tags = dict() %} 18 | 19 | {% for el in select_var %} 20 | {% if el.startswith('tag:') %} 21 | {% do selected_tags.update({el[4:]: True}) %} 22 | {% else %} 23 | {% do selected_nodes.update({el: True}) %} 24 | {% endif %} 25 | {% endfor %} 26 | {% endif %} 27 | 28 | {% for el in both %} 29 | {% if el.resource_type in ['model', 'seed', 'source'] %} 30 | {% if el.config.get('re_data_monitored') %} 31 | {% set target_name = el.identifier or el.alias or el.name %} 32 | 33 | {% if select_all %} 34 | {% set selected = true %} 35 | {% else %} 36 | {% set selected_name = selected_nodes.get(target_name, false) %} 37 | {% set selected_tag = [] %} 38 | 39 | {% for tag in el.tags %} 40 | {% if selected_tags.get(tag, false) %} 41 | {% do selected_tag.append(true) %} 42 | {% endif %} 43 | {% endfor %} 44 | 45 | {% set selected = selected_name or (selected_tag | length > 0) %} 46 | {% endif %} 47 | 48 | {% set metrics_groups = el.config.get('re_data_metrics_groups', var('re_data:default_metrics')) %} 49 | {% set additional_metrics = el.config.get('re_data_metrics', {}) %} 50 | 51 | {% do monitored.append({ 52 | 'name': re_data.name_in_db(target_name), 53 | 'schema': re_data.name_in_db(el.schema), 54 | 'database': re_data.name_in_db(el.database), 55 | 'time_filter': el.config.get('re_data_time_filter', none), 56 | 'metrics_groups': metrics_groups, 57 | 'additional_metrics': re_data.metrics_in_db(additional_metrics), 58 | 'metrics': re_data.metrics_in_db(re_data.final_metrics(metrics_groups, additional_metrics)), 59 | 'columns': re_data.columns_in_db(el.config.get('re_data_columns', none)), 60 | 'anomaly_detector': el.config.get('re_data_anomaly_detector', var('re_data:anomaly_detector', {})), 61 | 
'owners': re_data.prepare_model_owners(el.config.get('re_data_owners', []), owners_config), 62 | 'selected': selected 63 | }) 64 | %} 65 | {% endif %} 66 | {% endif %} 67 | {% endfor %} 68 | 69 | {{ return(monitored) }} 70 | {% endmacro %} 71 | 72 | {% macro get_owners_config() %} 73 | {% set owners_config = var('re_data:owners_config', {}) %} 74 | {{ return (owners_config) }} 75 | {% endmacro %} 76 | 77 | {% macro prepare_model_owners(re_data_owners, owners_config) %} 78 | {% set owners = {} %} 79 | {% set seen_identifiers = {} %} 80 | {% for owner in re_data_owners if owners_config.get(owner) %} 81 | {% set members = owners_config.get(owner) %} 82 | {% for member in members %} 83 | {% set identifier = member.get('identifier') %} 84 | {% if identifier not in seen_identifiers %} 85 | {% do seen_identifiers.update({identifier: true }) %} 86 | {% do owners.update({ 87 | identifier: { 88 | 'notify_channel': member.get('type'), 89 | 'owner': owner, 90 | 'name': member.get('name') 91 | } 92 | }) %} 93 | {% endif %} 94 | {% endfor %} 95 | {% endfor %} 96 | {{ return (owners) }} 97 | {% endmacro %} -------------------------------------------------------------------------------- /macros/metrics/base/expression.sql: -------------------------------------------------------------------------------- 1 | {% macro metrics_base_expressions(model, columns, table_level=False) %} 2 | 3 | {% set col_expr = [] %} 4 | 5 | {% for col in columns %} 6 | {% set column_name = re_data.row_value(col, 'column_name') %} 7 | {% do col_expr.extend(re_data.metrics_base_expression_column_all(model, col)) %} 8 | {% endfor %} 9 | 10 | {% if table_level %} 11 | {% do col_expr.extend(re_data.metrics_base_expresion_table_all(model)) %} 12 | {% endif %} 13 | 14 | {{ return (col_expr) }} 15 | 16 | {% endmacro %} 17 | 18 | {% macro metrics_base_expression_column_all(model, column) %} 19 | 20 | {%- set col_expr = [] %} 21 | {%- set metrics_to_compute = [] %} 22 | {% set column_name = re_data.row_value(column, 'column_name') %} 23 | {% set data_type = model.columns_info[column_name].data_type %} 24 | {% do metrics_to_compute.extend(model.metrics.get('group').get('column', {}).get(data_type, [])) %} 25 | {% do metrics_to_compute.extend(model.metrics.get('additional').get('column', {}).get(column_name, [])) %} 26 | 27 | {% for metric_value in metrics_to_compute %} 28 | {% set metric_obj = re_data.extract_metric_config(metric_value) %} 29 | {% set expression = re_data.metrics_base_expression_column(model, column_name, metric_obj['metric'], metric_obj['config']) %} 30 | {% do col_expr.append({ 'expr': expression, 'col_name': column_name, 'metric': metric_obj['metric']}) %} 31 | {% endfor %} 32 | 33 | {{ return (col_expr) }} 34 | 35 | {% endmacro %} 36 | 37 | 38 | {% macro metrics_base_expresion_table_all(model) %} 39 | {%- set table_expr = [] %} 40 | {%- set metrics_to_compute = [] %} 41 | {% do metrics_to_compute.extend(model.metrics.get('group').get('table', [])) %} 42 | {% do metrics_to_compute.extend(model.metrics.get('additional').get('table', [])) %} 43 | 44 | {% for metric_value in metrics_to_compute %} 45 | {% set metric_obj = re_data.extract_metric_config(metric_value) %} 46 | {% set expression = re_data.metrics_base_expression_table(model, metric_obj['metric'], metric_obj['config']) %} 47 | {% do table_expr.append({ 'expr': expression, 'col_name': '', 'metric': metric_obj['metric']}) %} 48 | {% endfor %} 49 | 50 | {{ return (table_expr) }} 51 | 52 | {% endmacro %} 53 | 54 | {% macro metrics_base_expression_table(model, 
metric_name, config) %} 55 | {% set metric_macro = re_data.get_metric_macro(metric_name) %} 56 | {% set context = {'time_filter': model.time_filter, 'metric_name': metric_name, 'config': config, 'table_name': model.table_name, 'column_name': none} %} 57 | 58 | {{ metric_macro(context) }} 59 | 60 | {% endmacro %} 61 | 62 | 63 | {%- macro metrics_base_expression_column(model, column_name, metric_name, config) %} 64 | {% set metric_macro = re_data.get_metric_macro(metric_name) %} 65 | {% set context = {'time_filter': model.time_filter, 'metric_name': metric_name, 'config': config, 'table_name': model.table_name, 'column_name': re_data.quote_column_name(column_name)} %} 66 | 67 | {{ metric_macro(context) }} 68 | 69 | {% endmacro %} 70 | 71 | {% macro extract_metric_config(metric_value) %} 72 | 73 | {% set config = none %} 74 | 75 | {% if metric_value is mapping %} 76 | {% set metric = metric_value.keys() | first %} 77 | {% if metric_value[metric] is none %} 78 | {{ exceptions.raise_compiler_error("Empty configuration passed for metric: " ~ metric ~ ". If the metric doesn't use a config, please use the column name as a string.") }} 79 | {% endif %} 80 | 81 | {% set config = metric_value[metric] %} 82 | {%- else %} 83 | {% set metric = metric_value %} 84 | {% endif %} 85 | 86 | {{ return ({'metric': metric, 'config': config}) }} 87 | 88 | {% endmacro %} 89 | 90 | {%- macro get_metric_macro(metric_name) %} 91 | {% set macro_name = 're_data_metric' + '_' + metric_name %} 92 | 93 | {% if context['re_data'].get(macro_name) %} 94 | {% set metric_macro = context['re_data'][macro_name] %} 95 | {%- else %} 96 | {% set metric_macro = context[project_name][macro_name] %} 97 | {% endif %} 98 | 99 | {{ return (metric_macro) }} 100 | 101 | {% endmacro %} 102 | 103 | -------------------------------------------------------------------------------- /macros/public/validating/regex_dict.sql: -------------------------------------------------------------------------------- 1 | {# 2 | # This file contains significant part of code licensed under: 3 | # Copyright 2020 Soda 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # Unless required by applicable law or agreed to in writing, software 9 | # distributed under the License is distributed on an "AS IS" BASIS, 10 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 11 | # See the License for the specific language governing permissions and 12 | # limitations under the License. 13 | #} 14 | 15 | {% macro get_regex_for(to_validate) %} 16 | 17 | {% set regexp_dict = { 18 | 'number_whole': '^\-?[0-9]+$', 19 | 'number_decimal_point': '^\-?[0-9]+\.[0-9]+$', 20 | 'number_decimal_comma': '^\-?[0-9]+,[0-9]+$', 21 | 'number_percentage': '^\-?[0-9]+([\.,][0-9]+)? ?%$', 22 | 'number_percentage_point': '^\-?[0-9]+([\.][0-9]+)? ?%$', 23 | 'number_percentage_comma': '^\-?[0-9]+([,][0-9]+)? 
?%$', 24 | 'date_eu': '^([1-9]|0[1-9]|[12][0-9]|3[01])[-\./]([1-9]|0[1-9]|1[012])[-\./](19|20)?[0-9][0-9]$', 25 | 'date_us': '^([1-9]|0[1-9]|1[012])[-\./]([1-9]|0[1-9]|[12][0-9]|3[01])[-\./](19|20)?[0-9][0-9]$', 26 | 'date_inverse': '^(19|20)[0-9][0-9][-\./]?([1-9]|0[1-9]|1[012])[-\./]?([1-9]|0[1-9]|[12][0-9]|3[01])$', 27 | 'time_24h': '^([01][0-9]|2[0-3]):([0-5][0-9])$', 28 | 'time_12h': '^(1[0-2]|0?[1-9]):[0-5][0-9]$', 29 | 'time': '^([0-9]|1[0-9]|2[0-4])[:-]([0-9]|[0-5][0-9])([:-]([0-9]|[0-5][0-9])(,[0-9]+)?)?$', 30 | 'date_iso_8601': 31 | '^' 32 | '([1-9][0-9]{3}-((0[1-9]|1[0-2])-(0[1-9]|1[0-9]|2[0-8])|(0[13-9]|1[0-2])-(29|30)|(0[13578]|1[02])-31)|' 33 | '([1-9][0-9](0[48]|[2468][048]|[13579][26])|([2468][048]|[13579][26])00)-02-29)' 34 | 35 | 'T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](\.[0-9]+)?' 36 | 37 | '(Z|[+-][01][0-9]:[0-5][0-9])?' 38 | '$', 39 | 'uuid': '^[0-9a-fA-F]{8}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{12}$', 40 | 'ipv4_address': '^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$', 41 | 'ipv6_address': '^((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:)))(%.+)?$', 42 | 'email': '^[A-Za-z0-9._%-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}$', 43 | } %} 44 | 45 | {% set base_regex = regexp_dict[to_validate] %} 46 | {% set qualified_regex = adapter.dispatch('get_regex_for', 're_data')(base_regex) %} 47 | {{ return(qualified_regex) }} 48 | 49 | {% endmacro %} 50 | 51 | {% macro default__get_regex_for(pattern) %} 52 | {{ return (pattern) }} 53 | {% endmacro %} 54 | 55 | {% macro redshift__get_regex_for(pattern) %} 56 | {% set changed = modules.re.sub('\.', '\\.', pattern) %} 57 | {% set changed = modules.re.sub('\-', '\\-', changed) %} 58 | {{ return (changed) }} 59 | {% endmacro %} 60 | 61 | {% macro snowflake__get_regex_for(pattern) %} 62 | {% set changed = modules.re.sub('\.', '\\.', pattern) %} 63 | {% set changed = modules.re.sub('\-', '\\-', changed) %} 64 | {{ return (changed) }} 65 | {% endmacro %} -------------------------------------------------------------------------------- /macros/run_end/save_results_history.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro save_test_history(results) -%} 3 | 4 | {{ adapter.dispatch('save_test_history_impl', 're_data') (results) }} 5 | 6 | {%- endmacro %} 7 | 8 | {% macro default__save_test_history_impl(results) 
%} 9 | -- depends_on: {{ ref('re_data_test_history') }} 10 | {% set command = flags.WHICH %} 11 | {% if execute and results and command in ('test', 'build') %} 12 | {% set tests = [] %} 13 | {% for el in results %} 14 | {% if el.node.resource_type.value == 'test' %} 15 | {% do tests.append(re_data.test_data_dict(el)) %} 16 | {% endif %} 17 | {% endfor %} 18 | 19 | {% if tests %} 20 | {% do re_data.insert_list_to_table( 21 | ref('re_data_test_history'), 22 | tests, 23 | ['table_name', 'column_name', 'test_name', 'status', 'execution_time', 'message', 'failures_count', 'failures_json', 'failures_table', 'severity', 'compiled_sql', 'run_at'], 24 | { 'run_at': timestamp_type() } 25 | ) %} 26 | {% endif %} 27 | 28 | {% endif %} 29 | {{ return ('') }} 30 | 31 | {% endmacro %} 32 | 33 | {% macro test_data_dict(el) %} 34 | 35 | {% set run_started_at_str = run_started_at.strftime('%Y-%m-%d %H:%M:%S') %} 36 | 37 | {% if el.node.to_dict().get('test_metadata') %} 38 | {% set any_refs = modules.re.findall("ref\(\'(?P<ref_name>.*)\'\)", el.node.test_metadata.kwargs['model']) %} 39 | {% set any_source = modules.re.findall("source\(\'(?P<source_name>.*)\'\,\s+\'(?P<table_name>.*)\'\)", el.node.test_metadata.kwargs['model']) %} 40 | 41 | {% if any_refs %} 42 | {% set name = any_refs[0] %} 43 | {% set node_name = re_data.priv_full_name_from_depends(el.node, name) %} 44 | {% set schema = graph.nodes.get(node_name)['schema'] %} 45 | {% set database = graph.nodes.get(node_name)['database'] %} 46 | {% set table_name = (database + '.' + schema + '.' + name) | lower %} 47 | 48 | {% elif any_source %} 49 | {% set package_name = any_source[0][0] %} 50 | {% set name = any_source[0][1] %} 51 | {% set node_name = re_data.priv_full_name_from_depends(el.node, name) %} 52 | {% set schema = graph.sources.get(node_name)['schema'] %} 53 | {% set database = graph.sources.get(node_name)['database'] %} 54 | {% set table_name = (database + '.' + schema + '.' 
+ name) | lower %} 55 | {% else %} 56 | {% set table_name = none %} 57 | {% endif %} 58 | {% else %} 59 | {% set table_name = none %} 60 | {% endif %} 61 | 62 | {% if var.has_var('re_data:query_test_failures') %} 63 | {% set query_failures = var('re_data:query_test_failures') %} 64 | {% else %} 65 | {% set query_failures = true %} 66 | {% endif %} 67 | 68 | {% if el.failures and el.failures > 0 and el.node.relation_name and query_failures %} 69 | {% if var.has_var('re_data:test_history_failures_limit') %} 70 | {% set limit_count = var('re_data:test_history_failures_limit')%} 71 | {% else %} 72 | {% set limit_count = 10 %} 73 | {% endif %} 74 | 75 | {% set failures_query %} 76 | select * from {{ el.node.relation_name}} limit {{ limit_count }} 77 | {% endset %} 78 | {% set failures_list = re_data.agate_to_list(run_query(failures_query)) %} 79 | {% endif %} 80 | 81 | {% set failures_json = none %} 82 | 83 | {{ return ({ 84 | 'table_name': table_name, 85 | 'column_name': el.node.column_name or none, 86 | 'test_name': el.node.name, 87 | 'status': el.status.name, 88 | 'execution_time': el.execution_time, 89 | 'message': el.message, 90 | 'failures_count': el.failures, 91 | 'failures_json': '' ~ failures_list, 92 | 'failures_table': el.node.relation_name or none, 93 | 'severity': el.node.config.severity, 94 | 'compiled_sql': el.node.compiled_sql or el.node.compiled_code or none, 95 | 'run_at': run_started_at_str, 96 | }) 97 | }} 98 | 99 | {% endmacro %} 100 | 101 | {% macro priv_full_name_from_depends(node, name) %} 102 | 103 | {% for full_name in node.depends_on.nodes %} 104 | {% set node_name = full_name.split('.')[-1] %} 105 | {% if node_name == name %} 106 | {{ return(full_name) }} 107 | {% endif %} 108 | {% endfor %} 109 | 110 | {{ return(none) }} 111 | 112 | {% endmacro %} 113 | -------------------------------------------------------------------------------- /macros/public/store/generate_overview.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro overview_select_base(type, timestamp_col) %} 3 | '{{ type }}' as {{ re_data.quote_column('type') }}, 4 | table_name as {{ re_data.quote_column('table_name') }}, 5 | column_name as {{ re_data.quote_column('column_name') }}, 6 | {{ timestamp_col }} as {{ re_data.quote_column('computed_on') }}, 7 | {% endmacro %} 8 | 9 | {% macro generate_overview(start_date, end_date, interval, overview_path=None, monitored_path=None) %} 10 | -- depends_on: {{ ref('re_data_anomalies') }} 11 | -- depends_on: {{ ref('re_data_base_metrics') }} 12 | -- depends_on: {{ ref('re_data_schema_changes') }} 13 | -- depends_on: {{ ref('re_data_columns') }} 14 | 15 | {# time grain is either 'hours' or 'days' #} 16 | {% set time_grain, num_str = interval.split(':') %} 17 | {% set num = num_str | int %} 18 | {% if time_grain == 'hours' %} 19 | {% set interval_length_sec = num * 3600 %} 20 | {% elif time_grain == 'days'%} 21 | {% set interval_length_sec = num * 3600 * 24 %} 22 | {% else %} 23 | {{ exceptions.raise_compiler_error("Invalid interval. 
Got: " ~ interval) }} 24 | {% endif %} 25 | {{ dbt_utils.log_info('[re_data] interval length in seconds is ' ~ interval_length_sec) }} 26 | {% set overview_query %} 27 | with schema_changes_casted as ( 28 | select id, table_name, operation, column_name, data_type, {{ bool_to_string('is_nullable') }}, prev_column_name, prev_data_type, {{ bool_to_string('prev_is_nullable') }}, detected_time 29 | from {{ ref('re_data_schema_changes') }} 30 | ), 31 | columns_casted as ( 32 | select {{ full_table_name('name', 'schema', 'database') }} as table_name, column_name, data_type, {{ bool_to_string('is_nullable') }}, computed_on 33 | from {{ ref('re_data_columns') }} 34 | ) 35 | 36 | ( 37 | select 38 | {{ overview_select_base('metric', 'computed_on')}} 39 | {{ to_single_json(['metric', 'value', 'time_window_end', 'interval_length_sec']) }} as {{ re_data.quote_column('data') }} 40 | from 41 | {{ ref('re_data_base_metrics') }} 42 | where {{ in_date_window('time_window_end', start_date, end_date) }} 43 | and interval_length_sec = {{interval_length_sec}} 44 | ) union all 45 | ( 46 | select 47 | {{ overview_select_base('anomaly', 'computed_on')}} 48 | {{ to_single_json(['id', 'metric', 'z_score_value', 'last_value', 'last_avg', 'last_stddev', 'time_window_end', 'interval_length_sec']) }} as {{ re_data.quote_column('data') }} 49 | from 50 | {{ ref('re_data_anomalies') }} 51 | where {{ in_date_window('time_window_end', start_date, end_date) }} 52 | and interval_length_sec = {{interval_length_sec}} 53 | ) union all 54 | ( 55 | select 56 | {{ overview_select_base('schema_change', 'detected_time')}} 57 | {{ to_single_json(['id', 'operation', 'data_type', 'is_nullable', 'prev_column_name', 'prev_data_type', 'prev_is_nullable', 'detected_time']) }} as {{ re_data.quote_column('data') }} 58 | from 59 | schema_changes_casted 60 | where {{ in_date_window('detected_time', start_date, none) }} 61 | ) union all 62 | ( 63 | select 64 | {{ overview_select_base('schema', 'computed_on')}} 65 | {{ to_single_json(['data_type', 'is_nullable']) }} as {{ re_data.quote_column('data') }} 66 | from 67 | columns_casted 68 | ) 69 | union all 70 | ( 71 | select 72 | 'alert' as {{ re_data.quote_column('type') }}, 73 | model as {{ re_data.quote_column('table_name') }}, 74 | null as {{ re_data.quote_column('column_name') }}, 75 | time_window_end as {{ re_data.quote_column('computed_on') }}, 76 | {{ to_single_json(['type', 'model', 'message', 'value', 'time_window_end']) }} as {{ re_data.quote_column('data') }} 77 | from 78 | {{ ref('re_data_alerts') }} 79 | where 80 | case 81 | when type = 'anomaly' then {{ in_date_window('time_window_end', start_date, end_date) }} 82 | else {{ in_date_window('time_window_end', start_date, none) }} 83 | end 84 | ) 85 | order by {{ re_data.quote_column('computed_on')}} desc 86 | {% endset %} 87 | 88 | {% set overview_result = run_query(overview_query) %} 89 | {% set overview_file_path = overview_path or '../target/re_data/overview.json' %} 90 | {% do overview_result.to_json(overview_file_path) %} 91 | {{ save_monitored(monitored_path) }} 92 | 93 | {% endmacro %} 94 | -------------------------------------------------------------------------------- /macros/tests/test_metrics.sql: -------------------------------------------------------------------------------- 1 | 2 | {% macro metric_expression(table, metric, expression, column_name=None, condition=None) %} 3 | select * from {{ref('re_data_base_metrics')}} 4 | where 5 | table_name = '{{ re_data.full_table_name_values(table.identifier, table.schema, 
table.database)}}' and 6 | metric = '{{ metric }}' and 7 | {% if condition is not none %} 8 | {{ condition }} and 9 | {% endif %} 10 | {% if column_name is none %} 11 | not ( {{ expression }} ) 12 | {% else %} 13 | column_name = '{{ column_name }}' and 14 | not ( {{ expression }} ) 15 | {% endif %} 16 | 17 | {% endmacro %} 18 | 19 | {# old test macros, will be removed after some time #} 20 | {% test metric_expression_is_true(model, table, metric, expression, column_name=None, condition=None) %} 21 | {{ re_data.metric_expression(table, metric, expression, column_name=column_name, condition=condition) }} 22 | {% endtest %} 23 | 24 | 25 | {% test metric_equal_to(model, table, metric, value, column_name=None, condition=None) %} 26 | {{ re_data.metric_expression(table, metric, 'value = ' ~ value, column_name, condition) }} 27 | {% endtest %} 28 | 29 | 30 | {% test metric_in_range(model, table, metric, min_value, max_value, column_name=None, condition=None) %} 31 | {{ re_data.metric_expression(table, metric, 'value >= ' ~ min_value ~ ' and value <= ' ~ max_value, column_name, condition) }} 32 | {% endtest %} 33 | 34 | {# new test macros #} 35 | 36 | {% test assert_true(model, column_name=None, metric=None, expression=None, condition=None) %} 37 | -- depends_on: {{ ref('re_data_base_metrics') }} 38 | {% if execute %} 39 | {{ re_data.metric_expression(model, metric, expression, column_name, condition) }} 40 | {% else %} 41 | {{ re_data.empty_table() }} 42 | {% endif %} 43 | {% endtest %} 44 | 45 | {% test assert_false(model, column_name=None, metric=None, expression=None, condition=None) %} 46 | -- depends_on: {{ ref('re_data_base_metrics') }} 47 | {% if execute %} 48 | {{ re_data.metric_expression(model, metric, 'not (' ~ expression ~ ')', column_name, condition) }} 49 | {% else %} 50 | {{ re_data.empty_table() }} 51 | {% endif %} 52 | {% endtest %} 53 | 54 | {% test assert_in_range(model, column_name=None, metric=None, min_value=None, max_value=None, condition=None) %} 55 | -- depends_on: {{ ref('re_data_base_metrics') }} 56 | {% if execute %} 57 | {{ re_data.metric_expression(model, metric, 'value >= ' ~ min_value ~ ' and value <= ' ~ max_value, column_name, condition) }} 58 | {% else %} 59 | {{ re_data.empty_table() }} 60 | {% endif %} 61 | {% endtest %} 62 | 63 | {% test assert_equal(model, column_name=None, metric=None, value=None, condition=None) %} 64 | -- depends_on: {{ ref('re_data_base_metrics') }} 65 | {% if execute %} 66 | {{ re_data.metric_expression(model, metric, 'value = ' ~ value, column_name, condition) }} 67 | {% else %} 68 | {{ re_data.empty_table() }} 69 | {% endif %} 70 | {% endtest %} 71 | 72 | {% test assert_greater(model, column_name=None, metric=None, value=None, condition=None) %} 73 | -- depends_on: {{ ref('re_data_base_metrics') }} 74 | {% if execute %} 75 | {{ re_data.metric_expression(model, metric, 'value > ' ~ value, column_name, condition) }} 76 | {% else %} 77 | {{ re_data.empty_table() }} 78 | {% endif %} 79 | {% endtest %} 80 | 81 | {% test assert_greater_equal(model, column_name=None, metric=None, value=None, condition=None) %} 82 | -- depends_on: {{ ref('re_data_base_metrics') }} 83 | {% if execute %} 84 | {{ re_data.metric_expression(model, metric, 'value >= ' ~ value, column_name, condition) }} 85 | {% else %} 86 | {{ re_data.empty_table() }} 87 | {% endif %} 88 | {% endtest %} 89 | 90 | {% test assert_less(model, column_name=None, metric=None, value=None, condition=None) %} 91 | -- depends_on: {{ ref('re_data_base_metrics') }} 92 | {% if execute %} 93 | 
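{# The `execute` guard used by every assert_* test above compiles the test to the empty placeholder relation from re_data.empty_table() during dbt's parse phase; the comparison against re_data_base_metrics is only rendered at execution time, once metric values actually exist. #}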
{{ re_data.metric_expression(model, metric, 'value < ' ~ value, column_name, condition) }} 94 | {% else %} 95 | {{ re_data.empty_table() }} 96 | {% endif %} 97 | {% endtest %} 98 | 99 | {% test assert_less_equal(model, column_name=None, metric=None, value=None, condition=None) %} 100 | -- depends_on: {{ ref('re_data_base_metrics') }} 101 | {% if execute %} 102 | {{ re_data.metric_expression(model, metric, 'value <= ' ~ value, column_name, condition) }} 103 | {% else %} 104 | {{ re_data.empty_table() }} 105 | {% endif %} 106 | {% endtest %} -------------------------------------------------------------------------------- /.github/workflows/run-db-tests.yml: -------------------------------------------------------------------------------- 1 | name: Run package tests for all dbs 2 | 3 | on: [push, delete] 4 | 5 | env: 6 | DBT_PROFILES_DIR: ${{ github.workspace }}/ 7 | SNOWFLAKE_RE_DATA_TESTING_ACCOUNT: ${{ secrets.SNOWFLAKE_RE_DATA_TESTING_ACCOUNT }} 8 | RE_DATA_TESTING_USER: ${{ secrets.RE_DATA_TESTING_USER }} 9 | RE_DATA_TESTING_PASSWORD: ${{ secrets.RE_DATA_TESTING_PASSWORD }} 10 | REDSHIFT_RE_DATA_TESTING_HOST: ${{ secrets.REDSHIFT_RE_DATA_TESTING_HOST }} 11 | BIGQUERY_TESTING_TYPE: ${{ secrets.BIGQUERY_TESTING_TYPE }} 12 | BIGQUERY_TESTING_PROJECT_ID: ${{ secrets.BIGQUERY_TESTING_PROJECT_ID }} 13 | BIGQUERY_TESTING_PRIVATE_KEY_ID: ${{ secrets.BIGQUERY_TESTING_PRIVATE_KEY_ID }} 14 | BIGQUERY_TESTING_PRIVATE_KEY: ${{ secrets.BIGQUERY_TESTING_PRIVATE_KEY }} 15 | BIGQUERY_TESTING_CLIENT_EMAIL: ${{ secrets.BIGQUERY_TESTING_CLIENT_EMAIL }} 16 | BIGQUERY_TESTING_CLIENT_ID: ${{ secrets.BIGQUERY_TESTING_CLIENT_ID }} 17 | BIGQUERY_TESTING_AUTH_URI: ${{ secrets.BIGQUERY_TESTING_AUTH_URI }} 18 | BIGQUERY_TESTING_TOKEN_URI: ${{ secrets.BIGQUERY_TESTING_TOKEN_URI }} 19 | BIGQUERY_TESTING_AUTH_PROVIDER_X509_CERT_URL: ${{ secrets.BIGQUERY_TESTING_AUTH_PROVIDER_X509_CERT_URL }} 20 | BIGQUERY_TESTING_CLIENT_X509_CERT_URL: ${{ secrets.BIGQUERY_TESTING_CLIENT_X509_CERT_URL }} 21 | DBT_VERSION: 1.7 22 | PYTHON_VERSION: "3.8.x" 23 | 24 | jobs: 25 | test-postgres: 26 | runs-on: ubuntu-latest 27 | if: github.event_name == 'push' 28 | services: 29 | postgres: 30 | image: postgres 31 | env: 32 | POSTGRES_PASSWORD: postgres 33 | # Set health checks to wait until postgres has started 34 | options: >- 35 | --health-cmd pg_isready 36 | --health-interval 10s 37 | --health-timeout 5s 38 | --health-retries 5 39 | ports: 40 | # Maps tcp port 5432 on service container to the host 41 | - 5432:5432 42 | steps: 43 | - name: Check out 44 | uses: actions/checkout@v2 45 | 46 | - uses: actions/setup-python@v4 47 | with: 48 | python-version: ${{ env.PYTHON_VERSION }} 49 | 50 | - name: Install dependencies 51 | working-directory: ./integration_tests 52 | run: | 53 | pip install -r requirements.txt 54 | pip install dbt-postgres==$DBT_VERSION 55 | dbt deps 56 | 57 | - name: Test DB 58 | working-directory: ./integration_tests/python_tests 59 | run: pytest --db postgres --source_schema dq 60 | 61 | test-other-dbs: 62 | runs-on: ubuntu-latest 63 | if: github.event_name == 'push' && github.repository == 're-data/dbt-re-data' && github.ref == 'refs/heads/main' 64 | strategy: 65 | fail-fast: false 66 | matrix: 67 | database: [snowflake, bigquery, redshift] 68 | steps: 69 | - name: Check out 70 | uses: actions/checkout@v2 71 | 72 | - uses: actions/setup-python@v4 73 | with: 74 | python-version: ${{ env.PYTHON_VERSION }} 75 | 76 | - name: Inject slug/short variables 77 | uses: rlespinasse/github-slug-action@v3.x 78 | 79 | - name: Set the 
DQ_SCHEMA environment variable 80 | shell: bash 81 | run: | 82 | echo "DQ_SCHEMA=dq_${GITHUB_REF_SLUG//[^[:alnum:]]/_}" >> $GITHUB_ENV 83 | 84 | - name: Print DQ_SCHEMA 85 | run: | 86 | echo $DQ_SCHEMA 87 | 88 | - name: Install dependencies 89 | working-directory: ./integration_tests 90 | run: | 91 | pip install -r requirements.txt 92 | pip install dbt-${{ matrix.database }}==$DBT_VERSION 93 | dbt deps 94 | 95 | - name: Drop schemas 96 | working-directory: ./integration_tests 97 | run: | 98 | dbt run-operation drop_all_schemas --args "{ schema_name: ${{ env.DQ_SCHEMA }} }" --profile re_data_${{ matrix.database }} --vars "{ source_schema: ${{ env.DQ_SCHEMA }} }" 99 | 100 | - name: Create Schemas if needed 101 | if: matrix.database == 'redshift' 102 | working-directory: ./integration_tests 103 | run: | 104 | dbt run-operation create_required_schemas --args "{ schema_name: ${{ env.DQ_SCHEMA }} }" --profile re_data_${{ matrix.database }} --vars "{ source_schema: ${{ env.DQ_SCHEMA }} }" 105 | 106 | - name: Test DB 107 | working-directory: ./integration_tests/python_tests 108 | run: | 109 | pytest --db ${{ matrix.database }} --source_schema ${{ env.DQ_SCHEMA }} 110 | 111 | clean-up-schemas: 112 | runs-on: ubuntu-latest 113 | if: github.event_name == 'delete' && github.repository == 're-data/dbt-re-data' && github.ref == 'refs/heads/main' 114 | strategy: 115 | fail-fast: false 116 | matrix: 117 | database: [snowflake, bigquery, redshift] 118 | steps: 119 | - name: Check out 120 | uses: actions/checkout@v2 121 | 122 | - uses: actions/setup-python@v4 123 | with: 124 | python-version: ${{ env.PYTHON_VERSION }} 125 | 126 | - name: Inject slug/short variables 127 | uses: rlespinasse/github-slug-action@v3.x 128 | 129 | - name: Set the DQ_SCHEMA environment variable 130 | shell: bash 131 | run: | 132 | echo "DQ_SCHEMA=dq_${GITHUB_EVENT_REF_SLUG//[^[:alnum:]]/_}" >> $GITHUB_ENV 133 | 134 | - name: Print DQ_SCHEMA 135 | run: | 136 | echo $DQ_SCHEMA 137 | 138 | - name: Install dependencies and drop branch schema 139 | working-directory: ./integration_tests 140 | run: | 141 | pip install -r requirements.txt 142 | pip install dbt-${{ matrix.database }}==$DBT_VERSION 143 | dbt deps 144 | dbt run-operation drop_all_schemas --args "{ schema_name: ${{ env.DQ_SCHEMA }} }" --profile re_data_${{ matrix.database }} --vars "{ source_schema: ${{ env.DQ_SCHEMA }} }" -------------------------------------------------------------------------------- /models/alerts/re_data_schema_changes.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key = 'id', 5 | on_schema_change='sync_all_columns', 6 | ) 7 | }} 8 | 9 | -- depends_on: {{ ref('re_data_run_started_at') }} 10 | -- depends_on: {{ ref('re_data_columns_over_time') }} 11 | -- depends_on: {{ ref('re_data_monitored') }} 12 | -- depends_on: {{ ref('re_data_selected') }} 13 | 14 | {% if execute and not re_data.in_compile() %} 15 | {% set last_data_points %} 16 | select 17 | distinct detected_time 18 | from {{ ref('re_data_columns_over_time') }} 19 | order by 20 | detected_time desc limit 2; 21 | {% endset %} 22 | 23 | {% set detected_times = run_query(last_data_points) %} 24 | 25 | {% set times_list = detected_times.columns[0].values() %} 26 | {% set most_recent_time = times_list[0] %} 27 | 28 | {% if times_list | length > 1 %} 29 | {% set prev_most_recent = times_list[1] %} 30 | {% else %} 31 | {% set prev_most_recent = times_list[0] %} 32 | {% endif %} 33 | {% else %} 34 | {% set 
times_list = () %} 35 | {% endif %} 36 | 37 | {% if times_list == () %} 38 | {{ 39 | re_data.empty_table_generic([ 40 | ('id', 'string'), 41 | ('table_name', 'string'), 42 | ('operation', 'string'), 43 | ('column_name', 'string'), 44 | ('data_type', 'string'), 45 | ('is_nullable', 'boolean'), 46 | ('prev_column_name', 'string'), 47 | ('prev_data_type', 'string'), 48 | ('prev_is_nullable', 'boolean'), 49 | ('detected_time', 'timestamp') 50 | ]) 51 | }} 52 | {% else %} 53 | 54 | with curr_monitored_schema as ( 55 | select * from {{ ref('re_data_columns_over_time')}} 56 | where detected_time = cast('{{ most_recent_time }}' as {{ timestamp_type() }}) 57 | and table_name in ( 58 | select {{ full_table_name('name', 'schema', 'database') }} from {{ ref('re_data_selected')}} 59 | ) 60 | ), 61 | 62 | 63 | prev_monitored_schema as ( 64 | select * from {{ ref('re_data_columns_over_time')}} 65 | where detected_time = cast('{{ prev_most_recent}}' as {{ timestamp_type() }}) 66 | and table_name in ( 67 | select {{ full_table_name('name', 'schema', 'database') }} from {{ ref('re_data_selected')}} 68 | ) 69 | ), 70 | 71 | all_changes as ( 72 | ( 73 | select 74 | curr.table_name as table_name, 75 | 'type_change' as operation, 76 | curr.column_name as column_name, 77 | curr.data_type as data_type, 78 | curr.is_nullable as is_nullable, 79 | 80 | prev.column_name as prev_column_name, 81 | prev.data_type as prev_data_type, 82 | prev.is_nullable as prev_is_nullable 83 | 84 | from curr_monitored_schema curr inner join prev_monitored_schema prev on (curr.table_name = prev.table_name and curr.column_name = prev.column_name) 85 | where 86 | curr.data_type != prev.data_type or 87 | curr.is_nullable != prev.is_nullable 88 | ) 89 | 90 | union all 91 | 92 | ( 93 | 94 | select 95 | curr.table_name as table_name, 96 | 'column_added' as operation, 97 | curr.column_name as column_name, 98 | curr.data_type as data_type, 99 | curr.is_nullable as is_nullable, 100 | 101 | null as prev_column_name, 102 | null as prev_data_type, 103 | null as prev_is_nullable 104 | 105 | from curr_monitored_schema curr left join prev_monitored_schema prev on (curr.table_name = prev.table_name and curr.column_name = prev.column_name) 106 | where prev.table_name is null and prev.column_name is null 107 | {# note: when a column is added, make sure we only detect for models that were previously monitored, 108 | this avoids a situation where a newly monitored model has all its columns detected with 'column_added' operation#} 109 | and curr.table_name in ( 110 | select table_name from prev_monitored_schema 111 | ) 112 | 113 | ) 114 | 115 | union all 116 | 117 | ( 118 | 119 | select 120 | prev.table_name as table_name, 121 | 'column_removed' as operation, 122 | null as column_name, 123 | null as data_type, 124 | null as is_nullable, 125 | 126 | prev.column_name as prev_column_name, 127 | prev.data_type as prev_data_type, 128 | prev.is_nullable as prev_is_nullable 129 | 130 | from prev_monitored_schema prev left join curr_monitored_schema curr on (curr.table_name = prev.table_name and curr.column_name = prev.column_name) 131 | where curr.table_name is null and curr.column_name is null 132 | 133 | ) 134 | ), 135 | 136 | all_with_time as ( 137 | select 138 | all_changes.table_name, 139 | all_changes.operation, 140 | all_changes.column_name, 141 | all_changes.data_type, 142 | all_changes.is_nullable, 143 | all_changes.prev_column_name, 144 | all_changes.prev_data_type, 145 | all_changes.prev_is_nullable, 146 | cast({{dbt.current_timestamp_backcompat()}} as 
{{ timestamp_type() }}) as detected_time 147 | from all_changes 148 | ) 149 | 150 | select 151 | cast ({{ dbt_utils.generate_surrogate_key([ 152 | 'table_name', 153 | 'column_name', 154 | 'detected_time' 155 | ]) }} as {{ string_type() }} ) as id, 156 | table_name, 157 | cast (operation as {{ string_type() }}) as operation, 158 | column_name, 159 | data_type, 160 | is_nullable, 161 | prev_column_name, 162 | prev_data_type, 163 | prev_is_nullable, 164 | detected_time 165 | from all_with_time 166 | 167 | {% endif %} 168 | -------------------------------------------------------------------------------- /integration_tests/seeds/monitoring/expected_z_score.csv: -------------------------------------------------------------------------------- 1 | table_name,column_name,metric,time_window_end,z_score_value,modified_z_score_value,last_value,last_avg,last_stddev,last_median,last_iqr,last_median_absolute_deviation,last_mean_absolute_deviation,interval_length_sec 2 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__avg_length,2021-05-03 00:00:00,0,0,14105,14105,0,14105,0,0,0,86400 3 | SAMPLE_TABLE,EVENT_TYPE,distinct_values,2021-05-03 00:00:00,-707,-674,1000,1500,707,1500,500,500,500,86400 4 | BUY_EVENTS,EVENT_TYPE,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 5 | SAMPLE_TABLE,EVENT_TYPE,max_length,2021-05-03 00:00:00,-707,-674,3000,3500,707,3500,500,500,500,86400 6 | SAMPLE_TABLE,EVENT_TYPE,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 7 | SAMPLE_TABLE,EVENT_TYPE,min_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400 8 | SAMPLE_TABLE,EVENT_TYPE,missing_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 9 | SAMPLE_TABLE,NULL_VALUE,nulls_percent,2021-05-03 00:00:00,0,0,100000,100000,0,100000,0,0,0,86400 10 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__avg_length,2021-05-03 00:00:00,0,0,3053,3053,0,3053,0,0,0,86400 11 | BUY_EVENTS,VALUE1,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 12 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 13 | SAMPLE_TABLE,---,freshness,2021-05-03 00:00:00,0,0,40765000,40765000,0,40765000,0,0,0,86400 14 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__stddev,2021-05-03 00:00:00,0,0,3028,3028,0,3028,0,0,0,86400 15 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__missing_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 16 | BUY_EVENTS,EVENT_TYPE,min_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400 17 | SAMPLE_TABLE,VALUE1,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 18 | BUY_EVENTS,VALUE2,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 19 | SAMPLE_TABLE,VALUE1,variance,2021-05-03 00:00:00,707,674,2491667,2425000,94281,2425000,66667,66667,66667,86400 20 | RE_DATA_SOURCE_TEST_TABLE,---,global__row_count,2021-05-03 00:00:00,0,0,10000,10000,0,10000,0,0,0,86400 21 | BUY_EVENTS,---,my_distinct_table_rows,2021-05-03 00:00:00,0,0,10000,10000,0,10000,0,0,0,86400 22 | SAMPLE_WITHOUT_TIME_FILTER,---,global__row_count,2021-05-03 00:00:00,0,0,19000,19000,0,19000,0,0,0,86400 23 | SAMPLE_TABLE,EVENT_TYPE,match_regex,2021-05-03 00:00:00,-707,-674,0,500,707,500,500,500,500,86400 24 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__diff,2021-05-03 00:00:00,0,0,4000,4000,0,4000,0,0,0,86400 25 | BUY_EVENTS,VALUE2,diff,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 26 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__avg,2021-05-03 00:00:00,0,0,5500,5500,0,5500,0,0,0,86400 27 | SAMPLE_TABLE,VALUE2,min,2021-05-03 00:00:00,0,0,109000,109000,0,109000,0,0,0,86400 28 | 
SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__max_length,2021-05-03 00:00:00,0,0,17000,17000,0,17000,0,0,0,86400 29 | SAMPLE_WITH_ANOMALY,VALUE1,min,2021-05-03 00:00:00,-707,-674,98000,102500,6364,102500,4500,4500,4500,86400 30 | SAMPLE_WITH_ANOMALY,VALUE2,max,2021-05-03 00:00:00,707,674,205000,202500,3536,202500,2500,2500,2500,86400 31 | SAMPLE_WITH_ANOMALY,VALUE2,diff,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 32 | SAMPLE_TABLE,EVENT_TYPE,not_match_regex_percent,2021-05-03 00:00:00,-707,-674,0,12500,17678,12500,12500,12500,12500,86400 33 | SAMPLE_TABLE,---,distinct_table_rows,2021-05-03 00:00:00,0,0,4000,4000,0,4000,0,0,0,86400 34 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__missing_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 35 | SAMPLE_WITH_ANOMALY,VALUE2,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 36 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 37 | SAMPLE_TABLE,VALUE2,stddev,2021-05-03 00:00:00,0,0,47975,47975,0,47975,0,0,0,86400 38 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__min,2021-05-03 00:00:00,0,0,990,990,0,990,0,0,0,86400 39 | BUY_EVENTS,VALUE2,max,2021-05-03 00:00:00,707,674,205000,202500,3536,202500,2500,2500,2500,86400 40 | SAMPLE_WITHOUT_TIME_FILTER,---,global__my_distinct_table_rows,2021-05-03 00:00:00,0,0,19000,19000,0,19000,0,0,0,86400 41 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__avg_length,2021-05-03 00:00:00,0,0,3900,3900,0,3900,0,0,0,86400 42 | SAMPLE_WITH_ANOMALY,VALUE1,avg,2021-05-03 00:00:00,-707,-674,98000,102500,6364,102500,4500,4500,4500,86400 43 | SAMPLE_TABLE,VALUE2,variance,2021-05-03 00:00:00,0,0,2301583,2301583,0,2301583,0,0,0,86400 44 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__missing_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 45 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,missing_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 46 | BUY_EVENTS,VALUE1,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 47 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__max,2021-05-03 00:00:00,0,0,4990,4990,0,4990,0,0,0,86400 48 | SAMPLE_WITH_ANOMALY,---,my_distinct_table_rows,2021-05-03 00:00:00,0,0,12000,12000,0,12000,0,0,0,86400 49 | BUY_EVENTS,VALUE1,diff,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 50 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__variance,2021-05-03 00:00:00,0,0,2784,2784,0,2784,0,0,0,86400 51 | SAMPLE_TABLE,EVENT_TYPE,duplicate_values,2021-05-03 00:00:00,0,0,1000,1000,0,1000,0,0,0,86400 52 | SAMPLE_TABLE,NULL_VALUE,nulls_count,2021-05-03 00:00:00,0,0,4000,4000,0,4000,0,0,0,86400 53 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 54 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 55 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__variance,2021-05-03 00:00:00,0,0,9167,9167,0,9167,0,0,0,86400 56 | SAMPLE_TABLE,EVENT_TYPE,avg_length,2021-05-03 00:00:00,-707,-674,3000,3125,177,3125,125,125,125,86400 57 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__min,2021-05-03 00:00:00,0,0,1000,1000,0,1000,0,0,0,86400 58 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__missing_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 59 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__avg,2021-05-03 00:00:00,0,0,3306,3306,0,3306,0,0,0,86400 60 | BUY_EVENTS,VALUE1,avg,2021-05-03 00:00:00,-707,-674,98000,102500,6364,102500,4500,4500,4500,86400 61 | SAMPLE_WITH_ANOMALY,VALUE2,min,2021-05-03 00:00:00,707,674,205000,202500,3536,202500,2500,2500,2500,86400 62 | 
RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 63 | SAMPLE_TABLE,EVENT_TYPE,not_match_regex,2021-05-03 00:00:00,-707,-674,0,500,707,500,500,500,500,86400 64 | BUY_EVENTS,VALUE1,max,2021-05-03 00:00:00,-707,-674,98000,102500,6364,102500,4500,4500,4500,86400 65 | SAMPLE_TABLE,VALUE1,stddev,2021-05-03 00:00:00,707,674,49917,49240,957,49240,677,677,677,86400 66 | BUY_EVENTS,VALUE1,min,2021-05-03 00:00:00,-707,-674,98000,102500,6364,102500,4500,4500,4500,86400 67 | SAMPLE_WITH_ANOMALY,VALUE2,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 68 | SAMPLE_TABLE,---,my_custom_table_metric,2021-05-03 00:00:00,0,0,1000000,1000000,0,1000000,0,0,0,86400 69 | BUY_EVENTS,EVENT_TYPE,max_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400 70 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,avg_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400 71 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 72 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 73 | SAMPLE_WITH_ANOMALY,VALUE1,max,2021-05-03 00:00:00,-707,-674,98000,102500,6364,102500,4500,4500,4500,86400 74 | SAMPLE_TABLE,---,my_distinct_table_rows,2021-05-03 00:00:00,0,0,9000,9000,0,9000,0,0,0,86400 75 | SAMPLE_TABLE,---,row_count,2021-05-03 00:00:00,0,0,4000,4000,0,4000,0,0,0,86400 76 | SAMPLE_TABLE,VALUE2,max,2021-05-03 00:00:00,0,0,209000,209000,0,209000,0,0,0,86400 77 | SAMPLE_WITH_ANOMALY,VALUE1,diff,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 78 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 79 | BUY_EVENTS,EVENT_TYPE,avg_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400 80 | SAMPLE_TABLE,VALUE1,diff,2021-05-03 00:00:00,707,674,110000,105000,7071,105000,5000,5000,5000,86400 81 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__missing_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 82 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__missing_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 83 | SAMPLE_TABLE,VALUE1,min,2021-05-03 00:00:00,0,0,100000,100000,0,100000,0,0,0,86400 84 | SAMPLE_TABLE,EVENT_TYPE,regex_test,2021-05-03 00:00:00,0,0,4000,4000,0,4000,0,0,0,86400 85 | SAMPLE_TABLE,EVENT_TYPE,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 86 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__min_length,2021-05-03 00:00:00,0,0,1000,1000,0,1000,0,0,0,86400 87 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__min_length,2021-05-03 00:00:00,0,0,10000,10000,0,10000,0,0,0,86400 88 | SAMPLE_WITH_ANOMALY,VALUE1,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 89 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__max,2021-05-03 00:00:00,0,0,10000,10000,0,10000,0,0,0,86400 90 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,missing_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 91 | SAMPLE_TABLE,VALUE2,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 92 | SAMPLE_WITH_ANOMALY,VALUE2,avg,2021-05-03 00:00:00,707,674,205000,202500,3536,202500,2500,2500,2500,86400 93 | BUY_EVENTS,---,freshness,2021-05-03 00:00:00,-707,-674,41065000,41186500,171827,41186500,121500,121500,121500,86400 94 | BUY_EVENTS,VALUE2,avg,2021-05-03 00:00:00,707,674,205000,202500,3536,202500,2500,2500,2500,86400 95 | BUY_EVENTS,---,row_count,2021-05-03 00:00:00,0,0,1000,1000,0,1000,0,0,0,86400 96 | SAMPLE_WITH_ANOMALY,---,freshness,2021-05-03 00:00:00,-707,-674,41065000,41186500,171827,41186500,121500,121500,121500,86400 97 | 
SAMPLE_WITHOUT_TIME_FILTER,RATING,global__max_length,2021-05-03 00:00:00,0,0,5000,5000,0,5000,0,0,0,86400 98 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,min_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400 99 | SAMPLE_TABLE,EVENT_TYPE,match_regex_percent,2021-05-03 00:00:00,-707,-674,0,12500,17678,12500,12500,12500,12500,86400 100 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,max_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400 101 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__stddev,2021-05-03 00:00:00,0,0,1668,1668,0,1668,0,0,0,86400 102 | SAMPLE_TABLE,VALUE1,max,2021-05-03 00:00:00,707,674,210000,205000,7071,205000,5000,5000,5000,86400 103 | SAMPLE_TABLE,VALUE2,avg,2021-05-03 00:00:00,0,0,180750,180750,0,180750,0,0,0,86400 104 | SAMPLE_WITH_ANOMALY,VALUE1,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 105 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__min_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400 106 | SAMPLE_TABLE,VALUE1,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 107 | BUY_EVENTS,EVENT_TYPE,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 108 | BUY_EVENTS,EVENT_TYPE,missing_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 109 | BUY_EVENTS,EVENT_TYPE,missing_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 110 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 111 | BUY_EVENTS,VALUE2,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 112 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__diff,2021-05-03 00:00:00,0,0,9000,9000,0,9000,0,0,0,86400 113 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 114 | BUY_EVENTS,VALUE2,min,2021-05-03 00:00:00,707,674,205000,202500,3536,202500,2500,2500,2500,86400 115 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__max_length,2021-05-03 00:00:00,0,0,5000,5000,0,5000,0,0,0,86400 116 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 117 | SAMPLE_TABLE,VALUE2,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 118 | SAMPLE_TABLE,EVENT_TYPE,duplicate_rows,2021-05-03 00:00:00,707,674,4000,3500,707,3500,500,500,500,86400 119 | RE_DATA_SOURCE_TEST_TABLE,---,global__my_distinct_table_rows,2021-05-03 00:00:00,0,0,10000,10000,0,10000,0,0,0,86400 120 | SAMPLE_TABLE,VALUE2,diff,2021-05-03 00:00:00,0,0,100000,100000,0,100000,0,0,0,86400 121 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 122 | SAMPLE_WITH_ANOMALY,---,row_count,2021-05-03 00:00:00,0,0,1000,1000,0,1000,0,0,0,86400 123 | SAMPLE_TABLE,EVENT_TYPE,unique_rows,2021-05-03 00:00:00,-707,-674,0,500,707,500,500,500,500,86400 124 | SAMPLE_TABLE,VALUE1,avg,2021-05-03 00:00:00,707,674,142500,135000,10607,135000,7500,7500,7500,86400 125 | SAMPLE_TABLE,EVENT_TYPE,missing_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400 126 | --------------------------------------------------------------------------------
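Usage sketch: the monitoring config read by pub_monitored_from_graph and the assert_* generic tests defined in macros/tests/test_metrics.sql are wired into a project's schema.yml roughly as below. The model name, time-filter column, and threshold values here are hypothetical examples chosen for illustration, not configuration taken from this repository.

models:
  - name: buy_events
    config:
      re_data_monitored: true
      re_data_time_filter: created_at   # hypothetical timestamp column
    columns:
      - name: value1
        tests:
          - re_data.assert_in_range:
              metric: nulls_percent
              min_value: 0
              max_value: 5
          - re_data.assert_true:
              metric: min
              expression: value > 0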