├── data
    ├── .gitkeep
    └── .DS_Store
├── analysis
    └── .gitkeep
├── macros
    ├── .gitkeep
    ├── db
    │   ├── postgres
    │   │   ├── escape.sql
    │   │   ├── quote_string.sql
    │   │   └── postgres_type_db.sql
    │   ├── redshift
    │   │   ├── escape.sql
    │   │   └── quote_string.sql
    │   ├── bigquery
    │   │   ├── quote_string.sql
    │   │   ├── quote_column_name.sql
    │   │   └── split_and_return_nth_value.sql
    │   ├── core
    │   │   ├── escape.sql
    │   │   ├── identifier_mapping.sql
    │   │   ├── quote_column_name.sql
    │   │   ├── quote_string.sql
    │   │   └── split_and_return_nth_value.sql
    │   └── snowflake
    │   │   └── identifier_mapping.sql
    ├── public
    │   ├── cleaning
    │   │   ├── clean_capitalize_words.sql
    │   │   ├── clean_additional_whitespace.sql
    │   │   └── clean_blacklist.sql
    │   ├── filtering
    │   │   ├── remove_duplicates.sql
    │   │   └── get_duplicates.sql
    │   ├── store
    │   │   ├── export_table_samples.sql
    │   │   ├── export_tests_history.sql
    │   │   ├── export_alerts.sql
    │   │   └── generate_overview.sql
    │   ├── normalizing
    │   │   └── normalize_values.sql
    │   └── validating
    │   │   ├── valid_with_regex.sql
    │   │   └── regex_dict.sql
    ├── utils
    │   ├── for_loops.sql
    │   ├── depends_macro.sql
    │   ├── comma_delimited_list.sql
    │   ├── comparison_text.sql
    │   ├── bool_to_string.sql
    │   ├── is_list.sql
    │   ├── formulas.sql
    │   ├── quote.sql
    │   ├── dict_from_list.sql
    │   ├── get_database.sql
    │   ├── in_compile.sql
    │   ├── regular_expression.sql
    │   ├── deduplication
    │   │   └── add_duplication_context.sql
    │   ├── json
    │   │   └── to_single_json.sql
    │   ├── agate
    │   │   └── row_value.sql
    │   ├── fivetran_utils
    │   │   ├── json_extract.sql
    │   │   └── percentile.sql
    │   ├── monitored_config.sql
    │   ├── mock
    │   │   └── empty_tables.sql
    │   ├── used_types.sql
    │   ├── column_types.sql
    │   ├── generate_alert_message.sql
    │   └── time_macros.sql
    ├── meta
    │   ├── monitored_model_queries.sql
    │   ├── save_monitored.sql
    │   ├── information_schema.sql
    │   ├── table_name.sql
    │   └── get_monitored.sql
    ├── metrics
    │   └── base
    │   │   ├── build_in
    │   │       ├── optional_table_metrics.sql
    │   │       ├── table_default.sql
    │   │       ├── column_default.sql
    │   │       └── optional_column_metrics.sql
    │   │   ├── internal_model_template.sql
    │   │   ├── queries.sql
    │   │   └── expression.sql
    ├── post_hook
    │   └── re_data_monitored.sql
    ├── store
    │   └── insert_list_to_table.sql
    ├── config
    │   └── get_model_config.sql
    ├── samples
    │   └── internal_model_template.sql
    ├── run_end
    │   └── save_results_history.sql
    └── tests
    │   └── test_metrics.sql
├── snapshots
    └── .gitkeep
├── tests
    └── .gitkeep
├── integration_tests
    ├── macros
    │   ├── .gitkeep
    │   ├── trigger_schema_change.sql
    │   ├── test_utils.sql
    │   ├── my_metrics.sql
    │   ├── create_test_source_tables.sql
    │   └── drop_all_schemas.sql
    ├── seeds
    │   ├── .gitkeep
    │   ├── public_macros
    │   │   ├── validating
    │   │   │   ├── validate_emails.csv
    │   │   │   ├── expected_validated_emails.csv
    │   │   │   ├── validate_numbers.csv
    │   │   │   ├── validate_date_and_time.csv
    │   │   │   ├── validate_uuid.csv
    │   │   │   ├── expected_validated_uuids.csv
    │   │   │   ├── expected_validated_credit_cards.csv
    │   │   │   ├── expected_validated_numbers.csv
    │   │   │   ├── expected_validated_date_and_time.csv
    │   │   │   ├── validate_ip.csv
    │   │   │   └── expected_validated_ips.csv
    │   │   ├── filtering
    │   │   │   ├── duplicated.csv
    │   │   │   ├── expected_duplicates.csv
    │   │   │   └── expected_deduplicated.csv
    │   │   ├── normalizing
    │   │   │   ├── abbreviated_us_states.csv
    │   │   │   ├── us_states_normalization.csv
    │   │   │   └── expected_us_states_normalized.csv
    │   │   └── cleaning
    │   │   │   ├── expected_sample_user_data.csv
    │   │   │   └── sample_user_data.csv
    │   └── monitoring
    │   │   ├── expected_table_samples.csv
    │   │   ├── sample_table.csv
    │   │   ├── sample_with_anomaly.csv
    │   │   ├── sample_without_time_filter.csv
    │   │   ├── expected_test_history.csv
    │   │   ├── expected_anomalies.csv
    │   │   └── expected_z_score.csv
    ├── tests
    │   └── .gitkeep
    ├── analysis
    │   └── .gitkeep
    ├── snapshots
    │   └── .gitkeep
    ├── python_tests
    │   ├── __init__.py
    │   ├── utils
    │   │   ├── __init__.py
    │   │   └── run.py
    │   ├── run_all_dbs.sh
    │   ├── conftest.py
    │   ├── test_cleaners.py
    │   ├── test_normalizers.py
    │   ├── test_validate.py
    │   ├── test_filters.py
    │   └── test_monitoring.py
    ├── requirements.txt
    ├── README.md
    ├── .gitignore
    ├── packages.yml
    ├── models
    │   ├── public_macros
    │   │   ├── filtering
    │   │   │   ├── duplicates.sql
    │   │   │   ├── schema.yml
    │   │   │   └── deduplicated.sql
    │   │   ├── cleaning
    │   │   │   ├── schema.yml
    │   │   │   └── sanitized_user_data.sql
    │   │   ├── normalizing
    │   │   │   ├── schema.yml
    │   │   │   └── us_states_normalized.sql
    │   │   └── validating
    │   │   │   ├── validated_emails.sql
    │   │   │   ├── validated_uuids.sql
    │   │   │   ├── validated_ips.sql
    │   │   │   ├── schema.yml
    │   │   │   ├── validated_numbers.sql
    │   │   │   └── validated_date_and_time.sql
    │   ├── monitoring
    │   │   ├── test_re_data_anomalies.sql
    │   │   ├── test_re_data_metrics.sql
    │   │   ├── test_re_data_test_history.sql
    │   │   ├── test_re_data_table_samples.sql
    │   │   ├── test_re_data_z_score.sql
    │   │   └── schema.yml
    │   ├── transformed
    │   │   ├── buy_events.sql
    │   │   └── schema.yml
    │   ├── sources
    │   │   └── schema.yml
    │   └── metrics
    │   │   └── re_data_metrics.yml
    ├── pytest.ini
    └── dbt_project.yml
├── models
    ├── internal
    │   ├── samples
    │   │   ├── re_data_last_table_samples.sql
    │   │   └── re_data_last_table_samples_part.sql
    │   ├── metrics
    │   │   └── base
    │   │   │   ├── re_data_last_base_metrics_part0.sql
    │   │   │   ├── re_data_last_base_metrics_part1.sql
    │   │   │   ├── re_data_last_base_metrics_part2.sql
    │   │   │   ├── re_data_last_base_metrics_part3.sql
    │   │   │   ├── re_data_last_base_metrics_thread0.sql
    │   │   │   ├── re_data_last_base_metrics_thread1.sql
    │   │   │   ├── re_data_last_base_metrics_thread2.sql
    │   │   │   └── re_data_last_base_metrics_thread3.sql
    │   └── re_data_run_started_at.sql
    ├── metrics
    │   ├── final
    │   │   └── re_data_metrics.sql
    │   ├── for_anomalies
    │   │   ├── re_data_last_metrics.sql
    │   │   └── re_data_last_stats.sql
    │   └── types
    │   │   ├── samples
    │   │       └── re_data_table_samples.sql
    │   │   ├── schema
    │   │       └── re_data_columns_over_time.sql
    │   │   └── base
    │   │       └── re_data_base_metrics.sql
    ├── meta
    │   ├── re_data_selected.sql
    │   ├── re_data_monitored.sql
    │   └── re_data_columns.sql
    ├── alerts
    │   ├── re_data_test_runs.sql
    │   ├── re_data_alerts.sql
    │   ├── re_data_z_score.sql
    │   ├── re_data_anomalies.sql
    │   └── re_data_schema_changes.sql
    └── logs
    │   └── re_data_test_history.sql
├── static
    └── lineage_graph.png
├── packages.yml
├── .gitignore
├── .github
    ├── pull_request_template.md
    ├── ISSUE_TEMPLATE
    │   ├── documentation-request.md
    │   ├── bug_report.md
    │   └── feature_request.md
    └── workflows
    │   ├── generate-docs.yml
    │   └── run-db-tests.yml
├── Makefile
├── README.md
├── LICENSE
├── dbt_project.yml
└── profiles.yml


/data/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/analysis/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/macros/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/snapshots/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/tests/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/integration_tests/macros/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/integration_tests/seeds/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/integration_tests/tests/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/integration_tests/analysis/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/integration_tests/snapshots/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/integration_tests/python_tests/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/integration_tests/python_tests/utils/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/data/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/re-data/dbt-re-data/HEAD/data/.DS_Store


--------------------------------------------------------------------------------
/models/internal/samples/re_data_last_table_samples.sql:
--------------------------------------------------------------------------------
1 | {{ re_data_last_table_samples() }}


--------------------------------------------------------------------------------
/models/metrics/final/re_data_metrics.sql:
--------------------------------------------------------------------------------
1 | select * from {{ ref('re_data_base_metrics') }}


--------------------------------------------------------------------------------
/integration_tests/requirements.txt:
--------------------------------------------------------------------------------
1 | protobuf==4.25.3
2 | pytest==6.2.5
3 | pyyaml==6.0
4 | 


--------------------------------------------------------------------------------
/integration_tests/README.md:
--------------------------------------------------------------------------------
1 | 
2 | dbt project for running dbt_re_data integration tests
3 | 
4 | 


--------------------------------------------------------------------------------
/macros/db/postgres/escape.sql:
--------------------------------------------------------------------------------
1 | {% macro postgres__escape_seq_for_json(chr) %}'\{{chr}}'{% endmacro %}


--------------------------------------------------------------------------------
/macros/db/postgres/quote_string.sql:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | {%- macro postgres__quote_new_line() %}'\\n'{% endmacro %}


--------------------------------------------------------------------------------
/models/internal/metrics/base/re_data_last_base_metrics_part0.sql:
--------------------------------------------------------------------------------
1 | {{ re_data_last_base_metrics_part() }}


--------------------------------------------------------------------------------
/models/internal/metrics/base/re_data_last_base_metrics_part1.sql:
--------------------------------------------------------------------------------
1 | {{ re_data_last_base_metrics_part() }}


--------------------------------------------------------------------------------
/models/internal/metrics/base/re_data_last_base_metrics_part2.sql:
--------------------------------------------------------------------------------
1 | {{ re_data_last_base_metrics_part() }}


--------------------------------------------------------------------------------
/models/internal/metrics/base/re_data_last_base_metrics_part3.sql:
--------------------------------------------------------------------------------
1 | {{ re_data_last_base_metrics_part() }}


--------------------------------------------------------------------------------
/static/lineage_graph.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/re-data/dbt-re-data/HEAD/static/lineage_graph.png


--------------------------------------------------------------------------------
/macros/db/redshift/escape.sql:
--------------------------------------------------------------------------------
1 | 
2 | {% macro redshift__escape_seq_for_json(chr) %}'\\\{{chr}}'{% endmacro %}


--------------------------------------------------------------------------------
/macros/db/redshift/quote_string.sql:
--------------------------------------------------------------------------------
1 | 
2 | {%- macro redshift__quote_new_line() %}'\134\134n'{% endmacro %}


--------------------------------------------------------------------------------
/models/internal/metrics/base/re_data_last_base_metrics_thread0.sql:
--------------------------------------------------------------------------------
1 | {{ re_data_last_base_metrics_thread(0)}}


--------------------------------------------------------------------------------
/models/internal/metrics/base/re_data_last_base_metrics_thread1.sql:
--------------------------------------------------------------------------------
1 | {{ re_data_last_base_metrics_thread(1)}}


--------------------------------------------------------------------------------
/models/internal/metrics/base/re_data_last_base_metrics_thread2.sql:
--------------------------------------------------------------------------------
1 | {{ re_data_last_base_metrics_thread(2)}}


--------------------------------------------------------------------------------
/models/internal/metrics/base/re_data_last_base_metrics_thread3.sql:
--------------------------------------------------------------------------------
1 | {{ re_data_last_base_metrics_thread(3)}}


--------------------------------------------------------------------------------
/packages.yml:
--------------------------------------------------------------------------------
1 | 
2 | packages:
3 |   - package: dbt-labs/dbt_utils
4 |     version: [">=1.0.0", "<1.2.0"]
5 | 


--------------------------------------------------------------------------------
/integration_tests/.gitignore:
--------------------------------------------------------------------------------
1 | 
2 | target/
3 | dbt_modules/
4 | dbt_packages/
5 | logs/
6 | .user.yml
7 | package-lock.yml
8 | 


--------------------------------------------------------------------------------
/macros/db/bigquery/quote_string.sql:
--------------------------------------------------------------------------------
1 | 
2 | {%- macro bigquery__quote_string(str) %}
3 |     r"""{{ str }}"""
4 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/db/postgres/postgres_type_db.sql:
--------------------------------------------------------------------------------
1 | {% macro postgres_type_db() %}
2 |     {{ ('postgres', 'greenplum') }}
3 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/packages.yml:
--------------------------------------------------------------------------------
1 | packages:
2 |   - local: ../
3 | 
4 |   - package: dbt-labs/dbt_utils
5 |     version: [">=1.0.0", "<1.2.0"]
6 | 


--------------------------------------------------------------------------------
/macros/public/cleaning/clean_capitalize_words.sql:
--------------------------------------------------------------------------------
1 | {% macro clean_capitalize_words(column_name) %}
2 |     initcap( {{column_name}} )
3 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/utils/for_loops.sql:
--------------------------------------------------------------------------------
1 | 
2 | {% macro print_list(l) %}
3 |     {% for el in l %}{{el}}{% if not loop.last %},{% endif %}{% endfor %}
4 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/filtering/duplicates.sql:
--------------------------------------------------------------------------------
1 |  {{ re_data.filter_get_duplicates(
2 |         ref('duplicated'), ['transaction_id'], ['creation_time']) }}
3 | 


--------------------------------------------------------------------------------
/integration_tests/python_tests/run_all_dbs.sh:
--------------------------------------------------------------------------------
1 | pytest --db postgres $@ & 
2 | pytest --db snowflake $@ &
3 | pytest --db bigquery $@ &
4 | pytest --db redshift $@ &
5 | wait 
6 | 


--------------------------------------------------------------------------------
/models/internal/samples/re_data_last_table_samples_part.sql:
--------------------------------------------------------------------------------
1 | {{
2 |     config(
3 |         materialized='table',
4 |     )
5 | }}
6 | 
7 | {{ re_data.empty_last_table_samples() }}


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | *.pyc
 2 | *.egg-info
 3 | .DS_Store
 4 | .vscode
 5 | .python-version
 6 | venv
 7 | .idea/
 8 | logs/*
 9 | dbt_modules/*
10 | dbt_packages/*
11 | target/*
12 | .env
13 | 


--------------------------------------------------------------------------------
/models/internal/re_data_run_started_at.sql:
--------------------------------------------------------------------------------
1 | {{
2 |     config(
3 |         materialized='table',
4 |     )
5 | }}
6 | 
7 | select {{ run_started_at.timestamp() * 1000000 }} as run_started_at


--------------------------------------------------------------------------------
/.github/pull_request_template.md:
--------------------------------------------------------------------------------
1 | ## What
2 | *Describe what the change is solving*
3 | *It helps to add screenshots if it affects the frontend.*
4 | 
5 | ## How
6 | *Describe the solution*
7 | 


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/validating/validate_emails.csv:
--------------------------------------------------------------------------------
1 | user_id,email
2 | 1,test@fakemail.com
3 | 2,novalidemail@
4 | 3,novalidemail@com
5 | 4,test+alovalidemail@fakemail.com
6 | 


--------------------------------------------------------------------------------
/macros/utils/depends_macro.sql:
--------------------------------------------------------------------------------
1 | {% macro generate_depends(used_tables) %}
2 |     {% for t in used_tables %}
3 |         -- depends_on: {{ ref(t) }}
4 |     {% endfor %}
5 | 
6 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/seeds/monitoring/expected_table_samples.csv:
--------------------------------------------------------------------------------
1 | table_name,sample_data_length
2 | BUY_EVENTS,506
3 | RE_DATA_SOURCE_TEST_TABLE,361
4 | SAMPLE_TABLE,830
5 | SAMPLE_WITH_ANOMALY,507
6 | 


--------------------------------------------------------------------------------
/models/meta/re_data_selected.sql:
--------------------------------------------------------------------------------
1 | 
2 | select 
3 |     name, schema, database, time_filter, metrics, columns, anomaly_detector, owners
4 | from {{ ref('re_data_monitored')}}
5 | where 
6 |     selected = true


--------------------------------------------------------------------------------
/macros/db/bigquery/quote_column_name.sql:
--------------------------------------------------------------------------------
1 | {% macro bigquery__quote_column_name(column_name) %}
2 |     {% set quoted_col_name = '`' + column_name + '`' %}
3 |     {{ return(quoted_col_name) }}
4 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/utils/comma_delimited_list.sql:
--------------------------------------------------------------------------------
1 | {% macro comma_delimited_list(args) %}
2 |     {%- for arg in args %}
3 |         {{- arg -}} {{- ", " if not loop.last else "" -}}
4 |     {% endfor %}
5 | {% endmacro %}
6 | 


--------------------------------------------------------------------------------
/macros/utils/comparison_text.sql:
--------------------------------------------------------------------------------
1 | {% macro comparison_text(a, b) %}
2 |     case when {{a}} > {{b}} then 'greater than' 
3 |     when {{a}} = {{b}} then 'equal to'
4 |     else 'less than' end
5 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/pytest.ini:
--------------------------------------------------------------------------------
1 | [pytest]
2 | norecursedirs = dbt_modules models target logs data analysis macros snapshots tests
3 | python_files = test_*.py
4 | python_functions = test*
5 | addopts = --capture=no --durations=0


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/validating/expected_validated_emails.csv:
--------------------------------------------------------------------------------
1 | user_id,email,email_valid
2 | 1,test@fakemail.com,1
3 | 2,novalidemail@,0
4 | 3,novalidemail@com,0
5 | 4,test+alovalidemail@fakemail.com,0
6 | 


--------------------------------------------------------------------------------
/macros/db/core/escape.sql:
--------------------------------------------------------------------------------
1 | 
2 | {% macro escape_seq_for_json(chr) %}{{adapter.dispatch('escape_seq_for_json', 're_data')(chr)}}{% endmacro %}
3 | 
4 | {% macro default__escape_seq_for_json(chr) %}'\\\{{chr}}'{% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/cleaning/schema.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | 
3 | models:
4 |   - name: sanitized_user_data
5 |     tests:
6 |       - dbt_utils.equality:
7 |           compare_model: ref('expected_sample_user_data')


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/normalizing/schema.yml:
--------------------------------------------------------------------------------
1 | version: 2
2 | 
3 | models:
4 |   - name: us_states_normalized
5 |     tests:
6 |       - dbt_utils.equality:
7 |           compare_model: ref('expected_us_states_normalized')


--------------------------------------------------------------------------------
/macros/db/snowflake/identifier_mapping.sql:
--------------------------------------------------------------------------------
1 | 
2 | {% macro snowflake__name_in_db(name) %}
3 |     {% if name %}
4 |         {{ return (name.upper()) }}
5 |     {% else %}
6 |         {{ return (name) }}
7 |     {% endif %}
8 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/utils/bool_to_string.sql:
--------------------------------------------------------------------------------
1 | 
2 | {% macro bool_to_string(column) %}
3 |     (
4 |     case when {{ column }} = true then 'true'
5 |          when {{ column }} = false then 'false'
6 |     end
7 |     ) as {{ column }}
8 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/validating/validate_numbers.csv:
--------------------------------------------------------------------------------
 1 | number
 2 | "133"
 3 | "1232.232"
 4 | "2332,123"
 5 | "not a number"
 6 | "1,3%"
 7 | "123%"
 8 | "13  %"
 9 | "76.234%"
10 | "not a number"
11 | "x"
12 | "123partly987"
13 | 


--------------------------------------------------------------------------------
/macros/utils/is_list.sql:
--------------------------------------------------------------------------------
1 | {% macro is_list(obj) %}
2 |     {% if not obj %}
3 |         {{ return (False) }}
4 |     {% endif %}
5 |     {% set check = obj is iterable and (obj is not string and obj is not mapping) %}
6 |     {{ return (check) }}
7 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/db/bigquery/split_and_return_nth_value.sql:
--------------------------------------------------------------------------------
1 | {% macro bigquery__split_and_return_nth_value(column_name, delimiter, ordinal) %}
2 |     split({{ re_data.clean_blacklist(column_name, ['"', '`'], '') }}, '{{ delimiter }}')[ORDINAL( {{ ordinal }} )]
3 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/validating/validate_date_and_time.csv:
--------------------------------------------------------------------------------
 1 | date_time
 2 | 31-01-2020
 3 | 01/31/2020
 4 | 05.05.2020
 5 | 2020-01-31
 6 | 23:59
 7 | 12:59
 8 | 13:59:01
 9 | "12:59:01,55"
10 | 11:59:00
11 | midnight
12 | 2020-01-31T12:59:00+02:00
13 | 2020-01-31T12:59:00


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/validating/validated_emails.sql:
--------------------------------------------------------------------------------
1 | with 
2 |     all_emails as (
3 |         select * from {{ ref('validate_emails') }}
4 |     )
5 | 
6 | select *, case when {{ re_data.valid_email('email') }} then 1 else 0 end as email_valid
7 |     from all_emails 
8 | 


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/validating/validated_uuids.sql:
--------------------------------------------------------------------------------
1 | with 
2 |     all_rows as (
3 |         select * from {{ ref('validate_uuid') }}
4 |     )
5 | 
6 | select *, 
7 |     case when {{ re_data.valid_uuid('uuid') }} then 1 else 0 end as valid_uuid
8 |     from all_rows 
9 | 


--------------------------------------------------------------------------------
/integration_tests/models/monitoring/test_re_data_anomalies.sql:
--------------------------------------------------------------------------------
1 | 
2 | select
3 |     {{ clean_table_name('table_name') }} as table_name,
4 |     {{ clean_column_name('column_name') }} as column_name,
5 |     metric,
6 |     anomaly_detector,
7 |     interval_length_sec
8 | 
9 | from {{ ref('re_data_anomalies') }}


--------------------------------------------------------------------------------
/macros/db/core/identifier_mapping.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {% macro name_in_db(name) %}
 3 |     {% set translated = adapter.dispatch('name_in_db', 're_data')(name) %}
 4 |     {{ return(translated) }}
 5 |     
 6 | {% endmacro %}
 7 | 
 8 | {% macro default__name_in_db(name) %}
 9 |     {{ return(name) }}
10 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/cleaning/sanitized_user_data.sql:
--------------------------------------------------------------------------------
1 | select
2 |     {{ re_data.clean_capitalize_words(re_data.clean_additional_whitespaces('full_name')) }} as full_name,
3 |     {{ re_data.clean_blacklist('email', ['^[a-zA-Z0-9_.+-]+'], '*****') }} as email
4 | from {{ ref('sample_user_data') }}
5 | 


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/validating/validate_uuid.csv:
--------------------------------------------------------------------------------
1 | uuid
2 | ace1245c-3af5-11ec-8d3d-0242ac130003
3 | a568464e-a05d-412c-8b30-517a46c57d88
4 | notanuid
5 | d0d61836-3af5-11ec-8d3d-0242ac130003
6 | d0d61c6e-3af5-11ec-8d3d-0242ac130003
7 | 343422-234324-234234-4234234-23432
8 | 343422-234324-234234-4234234-234xxx32


--------------------------------------------------------------------------------
/macros/utils/formulas.sql:
--------------------------------------------------------------------------------
 1 | {% macro percentage_formula(summation, total) %}
 2 |     abs(
 3 |         ( 
 4 |             cast({{ summation }} as {{ numeric_type() }})
 5 |         ) / 
 6 |         nullif(
 7 |             cast( {{ total }} as {{ numeric_type() }} )
 8 |         , 0) * 100.0
 9 |     )
10 | {% endmacro %}


--------------------------------------------------------------------------------
/models/metrics/for_anomalies/re_data_last_metrics.sql:
--------------------------------------------------------------------------------
 1 | select
 2 |     table_name,
 3 |     column_name,
 4 |     metric,
 5 |     value as last_value,
 6 |     interval_length_sec,
 7 |     computed_on
 8 | from 
 9 |     {{ ref('re_data_base_metrics') }}
10 | where
11 |     time_window_end = {{- time_window_end() -}}
12 | 
13 | 


--------------------------------------------------------------------------------
/macros/meta/monitored_model_queries.sql:
--------------------------------------------------------------------------------
 1 | {% macro get_tables() %}
 2 |     select *
 3 |     from {{ ref('re_data_selected') }}
 4 |     order by name, schema, database, time_filter
 5 | {% endmacro %}
 6 | 
 7 | {% macro get_schemas() %}
 8 |     select distinct schema, database
 9 |     from {{ ref('re_data_selected') }}
10 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/filtering/schema.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | models:
 4 |   - name: deduplicated
 5 |     tests:
 6 |       - dbt_utils.equality:
 7 |           compare_model: ref('expected_deduplicated')
 8 |   - name: duplicates
 9 |     tests:
10 |       - dbt_utils.equality:
11 |           compare_model: ref('expected_duplicates')


--------------------------------------------------------------------------------
/macros/metrics/base/build_in/optional_table_metrics.sql:
--------------------------------------------------------------------------------
1 | {% macro re_data_metric_distinct_table_rows(context) %}
2 |     with temp_table AS (
3 |             select distinct * from {{ context.table_name }}
4 |             where {{ in_time_window(context.time_filter) }}
5 |         )
6 |     select coalesce(count(*), 0) FROM temp_table
7 | {% endmacro %}
8 | 


--------------------------------------------------------------------------------
/macros/utils/quote.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | {% macro quote_column(col_name) %}
 4 |     {{ adapter.dispatch('quote_column', 're_data')(col_name) }}
 5 | {% endmacro %}
 6 | 
 7 | {% macro default__quote_column(col_name) %}
 8 |     "{{ col_name }}"
 9 | {% endmacro %}
10 | 
11 | {% macro bigquery__quote_column(col_name) %}
12 |     `{{ col_name }}`
13 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/validating/expected_validated_uuids.csv:
--------------------------------------------------------------------------------
1 | uuid,valid_uuid
2 | ace1245c-3af5-11ec-8d3d-0242ac130003,1
3 | a568464e-a05d-412c-8b30-517a46c57d88,1
4 | notanuid,0
5 | d0d61836-3af5-11ec-8d3d-0242ac130003,1
6 | d0d61c6e-3af5-11ec-8d3d-0242ac130003,1
7 | 343422-234324-234234-4234234-23432,0
8 | 343422-234324-234234-4234234-234xxx32,0
9 | 


--------------------------------------------------------------------------------
/models/alerts/re_data_test_runs.sql:
--------------------------------------------------------------------------------
 1 | {{
 2 |     config(
 3 |         materialized='view'
 4 |     )
 5 | }}
 6 | 
 7 | select 
 8 |     sum(case when status = 'Fail' then 1 else 0 end) as failed,
 9 |     sum(case when status = 'Pass' then 1 else 0 end) as passed,
10 |     run_at
11 | from {{ ref ('re_data_test_history') }}
12 | group by run_at
13 | order by run_at desc


--------------------------------------------------------------------------------
/integration_tests/models/monitoring/test_re_data_metrics.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | select
 3 |     {{ clean_table_name('table_name') }} as table_name,
 4 |     {{ clean_column_name('column_name') }} as column_name,
 5 |     metric,
 6 |     time_window_start,
 7 |     time_window_end,
 8 |     {{ to_big_integer('value') }},
 9 |     interval_length_sec
10 | 
11 | from {{ ref('re_data_metrics') }}


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/validating/expected_validated_credit_cards.csv:
--------------------------------------------------------------------------------
 1 | credit_card_number,valid_credit_card
 2 | 4941533405630082,1
 3 | 5476749195896614,1
 4 | 3568497486294461,1
 5 | not_a_card_number,0
 6 | 3434-4351-4234-3234,0
 7 | 3434 4351 4234 3234,0
 8 | 34344 42344 43455 43456,0
 9 | 43423432,0
10 | 234343443434,0
11 | 2343423423423423423423423423,0
12 | 


--------------------------------------------------------------------------------
/macros/utils/dict_from_list.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {% macro dict_from_list(el_list) %}
 3 | 
 4 |     {% if el_list is none %}
 5 |         {{ return (none) }}
 6 |     {% endif %}
 7 | 
 8 |     {% set for_cols_dict = {} %}
 9 |     {% for col in el_list %}
10 |         {% do for_cols_dict.update({col: True})%}
11 |     {% endfor %}
12 |     {% do return(for_cols_dict) %}
13 | 
14 | {% endmacro %}


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/documentation-request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Documentation request
 3 | about: Describe this issue template's purpose here.
 4 | title: "[DOCUMENTATION]"
 5 | labels: documentation
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Tell us about the documentation you'd like us to add or update**
11 | 
12 | **Is anything not clear or outdates in the current version of docs?**
13 | 


--------------------------------------------------------------------------------
/integration_tests/models/transformed/buy_events.sql:
--------------------------------------------------------------------------------
 1 | {{
 2 |     config(
 3 |         re_data_monitored=true,
 4 |         re_data_time_filter='creation_time',
 5 |         re_data_anomaly_detector={'name': 'z_score', 'threshold': 0.5},
 6 |         materialized='table',
 7 |         tags=['testtag']
 8 |     )
 9 | }}
10 | select *
11 | from {{ ref('sample_with_anomaly') }}
12 | where event_type = 'buy'


--------------------------------------------------------------------------------
/macros/db/core/quote_column_name.sql:
--------------------------------------------------------------------------------
 1 | {% macro quote_column_name(column_name) %}
 2 |     {% set col_name = adapter.dispatch('quote_column_name', 're_data')(column_name) %}
 3 |     {{ return(col_name) }}
 4 | {% endmacro %}
 5 | 
 6 | 
 7 | {% macro default__quote_column_name(column_name) %}
 8 |     {% set quoted_col_name = '"' + column_name + '"' %}
 9 |     {{ return(quoted_col_name) }}
10 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/filtering/duplicated.csv:
--------------------------------------------------------------------------------
 1 | transaction_id,creation_time,status,value
 2 | 1,2021-05-01 12:31:32,pending,100
 3 | 2,2021-05-01 12:35:35,pending,200
 4 | 1,2021-05-01 12:40:35,completed,100
 5 | 3,2021-05-01 12:40:35,pending,300
 6 | 3,2021-05-02 12:31:32,completed,300
 7 | 4,2021-05-02 12:35:35,completed,10
 8 | 5,2021-05-02 12:40:35,pending,100
 9 | 4,2021-05-02 12:40:35,completed,40
10 | 


--------------------------------------------------------------------------------
/macros/db/core/quote_string.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {%- macro quote_string(str) %}
 3 |     {{ adapter.dispatch('quote_string', 're_data')(str) }}
 4 | {% endmacro %}
 5 | 
 6 | {%- macro default__quote_string(str) %}
 7 |     $${{ str }}$$
 8 | {% endmacro %}
 9 | 
10 | 
11 | {%- macro quote_new_line() %}{{ adapter.dispatch('quote_new_line', 're_data')() }}{% endmacro %}
12 | 
13 | {%- macro default__quote_new_line() %}'\134\134n'{% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/models/monitoring/test_re_data_test_history.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | select
 3 |     {{ clean_table_name('table_name') }} as table_name,
 4 |     {{ clean_column_name('column_name') }} as column_name,
 5 |     right(test_name, 15) as test_name,
 6 |     status,
 7 |     {{ clean_column_name('message') }} as message,
 8 |     cast (failures_count as integer) as failures_count,
 9 |     severity
10 | from {{ ref('re_data_test_history') }}


--------------------------------------------------------------------------------
/integration_tests/python_tests/conftest.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | def pytest_addoption(parser):
 4 |     parser.addoption("--db", action="store")
 5 |     parser.addoption("--source_schema", action="store")
 6 |     
 7 | 
 8 | @pytest.fixture()
 9 | def db(pytestconfig):
10 |     return pytestconfig.getoption("db")
11 | 
12 | @pytest.fixture()
13 | def source_schema(pytestconfig):
14 |     return pytestconfig.getoption("source_schema")


--------------------------------------------------------------------------------
/macros/db/core/split_and_return_nth_value.sql:
--------------------------------------------------------------------------------
1 | {% macro split_and_return_nth_value(column_name, delimiter, ordinal) -%}
2 |     {{ adapter.dispatch('split_and_return_nth_value', 're_data')(column_name, delimiter, ordinal) }}
3 | {%- endmacro %}
4 | 
5 | {% macro default__split_and_return_nth_value(column_name, delimiter, ordinal) -%}
6 |     split_part({{ re_data.clean_blacklist(column_name, ['"', '`'], '') }}, '{{ delimiter }}', {{ ordinal }})
7 | {%- endmacro %}


--------------------------------------------------------------------------------
/integration_tests/models/monitoring/test_re_data_table_samples.sql:
--------------------------------------------------------------------------------
1 | 
2 | select 
3 |     {{ clean_table_name('table_name') }} as table_name,
4 |     length(sample_data) as sample_data_length
5 | from {{ ref('re_data_table_samples') }}
6 | where {{ clean_table_name('table_name') }} != 'SAMPLE_WITHOUT_TIME_FILTER'
7 | 
8 | -- SAMPLE_WITHOUT_TIME_FILTER because this table doesn't have a time filter, it's not possible to say how
9 | -- exactly the sampel of it should look like.


--------------------------------------------------------------------------------
/macros/post_hook/re_data_monitored.sql:
--------------------------------------------------------------------------------
 1 | {% macro pub_insert_into_re_data_monitored() %}
 2 |     {% set monitored = re_data.pub_monitored_from_graph() %}
 3 |     {% do re_data.insert_list_to_table(
 4 |         this,
 5 |         monitored,
 6 |         ['name', 'schema', 'database', 'time_filter', 'metrics_groups', 'additional_metrics', 'metrics', 'columns', 'anomaly_detector', 'owners', 'selected']
 7 |     ) %}
 8 | 
 9 |     {{ return('') }}
10 | 
11 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/validating/validated_ips.sql:
--------------------------------------------------------------------------------
 1 | with 
 2 |     all_rows as (
 3 |         select * from {{ ref('validate_ip') }}
 4 |     )
 5 | 
 6 | select *, 
 7 |     case when {{ re_data.valid_ip_v4('ip_address') }} then 1 else 0 end as valid_ip_v4,
 8 |     case when {{ re_data.valid_ip_v6('ip_address') }} then 1 else 0 end as valid_ip_v6,
 9 |     case when {{ re_data.valid_ip('ip_address') }} then 1 else 0 end as valid_ip
10 |     from all_rows 
11 | 


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/filtering/expected_duplicates.csv:
--------------------------------------------------------------------------------
1 | "transaction_id","creation_time","status","value","re_data_duplicate_group_row_count","re_data_duplicate_group_row_number"
2 | 1,"2021-05-01 12:31:32","pending",100,2,1
3 | 1,"2021-05-01 12:40:35","completed",100,2,2
4 | 3,"2021-05-01 12:40:35","pending",300,2,1
5 | 3,"2021-05-02 12:31:32","completed",300,2,2
6 | 4,"2021-05-02 12:35:35","completed",10,2,1
7 | 4,"2021-05-02 12:40:35","completed",40,2,2
8 | 


--------------------------------------------------------------------------------
/integration_tests/models/sources/schema.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | sources:
 4 |   - name:  "{{ var('source_schema', target.schema) }}"
 5 |     tables:
 6 |       - name: re_data_source_test_table
 7 |         columns:
 8 |           - name: number
 9 |             tests:
10 |               - not_null
11 |               - unique
12 |           
13 |           - name: description
14 |             tests:
15 |               - not_null
16 |               - unique
17 |           
18 |       


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/validating/expected_validated_numbers.csv:
--------------------------------------------------------------------------------
 1 | number,is_number,is_number_decimal_point,is_number_decimal_comma,is_percentage,is_percentage_decimal_point,is_percentage_decimal_comma
 2 | 133,1,0,0,0,0,0
 3 | 1232.232,0,1,0,0,0,0
 4 | "2332,123",0,0,1,0,0,0
 5 | not a number,0,0,0,0,0,0
 6 | "1,3%",0,0,0,1,0,1
 7 | 123%,0,0,0,1,1,1
 8 | 13  %,0,0,0,0,0,0
 9 | 76.234%,0,0,0,1,1,0
10 | not a number,0,0,0,0,0,0
11 | x,0,0,0,0,0,0
12 | 123partly987,0,0,0,0,0,0
13 | 


--------------------------------------------------------------------------------
/macros/utils/get_database.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {% macro get_target_database() %}
 3 |     {{- adapter.dispatch('get_target_database', 're_data')() -}}
 4 | {% endmacro %}
 5 | 
 6 | {% macro default__get_target_database() %}
 7 |     {{- return (target.dbname) -}}
 8 | {% endmacro %}
 9 | 
10 | {% macro bigquery__get_target_database() %}
11 |     {{- return (target.project) -}}
12 | {% endmacro %}
13 | 
14 | {% macro snowflake__get_target_database() %}
15 |     {{- return (target.database) -}}
16 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/seeds/monitoring/sample_table.csv:
--------------------------------------------------------------------------------
 1 | creation_time,update_time,event_type,value1,value2,null_value,not_used_colum
 2 | 2021-04-30 12:40:35,,buy,100,109,,1
 3 | 2021-05-01 12:31:32,,buy,100,200,,1
 4 | 2021-05-01 12:35:35,,buy,110,205,,1
 5 | 2021-05-01 12:40:35,,sell,200,209,,1
 6 | 2021-05-01 12:40:35,2021-05-01 12:40:37,buy,100,109,,1
 7 | 2021-05-02 12:31:32,,buy,110,200,,1
 8 | 2021-05-02 12:35:35,,buy,150,205,,1
 9 | 2021-05-02 12:40:35,,buy,210,209,,1
10 | 2021-05-02 12:40:35,,buy,100,109,,1
11 | 


--------------------------------------------------------------------------------
/integration_tests/seeds/monitoring/sample_with_anomaly.csv:
--------------------------------------------------------------------------------
 1 | creation_time,event_type,value1,value2
 2 | 2021-04-30 12:40:35,buy,101,109
 3 | 2021-05-01 12:31:32,buy,107,200
 4 | 2021-05-02 12:35:35,buy,98,205
 5 | 2021-05-03 12:40:35,sell,108,209
 6 | 2021-05-04 12:40:35,buy,100,109
 7 | 2021-05-05 12:31:32,buy,110,200
 8 | 2021-05-06 12:35:35,buy,99,205
 9 | 2021-05-07 12:40:35,buy,94,209
10 | 2021-05-08 12:40:35,buy,104,109
11 | 2021-05-09 12:31:32,buy,10,200
12 | 2021-05-10 12:35:35,buy,23,205
13 | 2021-05-11 12:40:35,sell,10,209
14 | 


--------------------------------------------------------------------------------
/macros/public/filtering/remove_duplicates.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | {% macro filter_remove_duplicates(relation, unique_cols, sort_columns) %}
 5 |     (
 6 |         with with_row_num as (
 7 |             {{re_data.add_duplication_context(relation, unique_cols, sort_columns)}}
 8 |         ),
 9 |         one_row_num as (
10 |             select * from with_row_num where re_data_duplicate_group_row_number = 1
11 |         )
12 |         select {{ dbt_utils.star(from=relation) }}
13 |         from one_row_num
14 |     ) 
15 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/macros/trigger_schema_change.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {% macro schema_change_buy_events_add_column() %}
 3 |     {% set alter_table %}
 4 |         alter table {{ ref('buy_events')}} add column sample_column boolean
 5 |     {% endset %}
 6 |     {% do run_query(alter_table) %}
 7 | {% endmacro %}
 8 | 
 9 | 
10 | {% macro schema_change_buy_events_drop_column() %}
11 |     {% set alter_table %}
12 |         alter table {{ ref('buy_events')}} drop column sample_column
13 |     {% endset %}
14 |     {% do run_query(alter_table) %}
15 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/utils/in_compile.sql:
--------------------------------------------------------------------------------
 1 | {% macro in_compile() %}
 2 | 
 3 |     {%- call statement('in_compile', fetch_result=True) -%}
 4 |         select * from {{ ref('re_data_run_started_at') }}
 5 |     {%- endcall -%}
 6 | 
 7 |     {% if execute %}
 8 |         {%- set result = load_result('in_compile')['data'][0][0] -%}
 9 |         {% if result == run_started_at.timestamp() * 1000000 %}
10 |             {{ return(False) }}
11 |         {% else %}
12 |             {{ return(True) }}
13 |         {% endif %}
14 |     {% endif %}
15 | 
16 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/python_tests/test_cleaners.py:
--------------------------------------------------------------------------------
 1 | from .utils.run import dbt_seed, dbt_run, dbt_test
 2 | 
 3 | def test_cleaners(db, source_schema, debug=True):
 4 |     dbt_vars = {
 5 |         'source_schema': source_schema
 6 |     }
 7 | 
 8 |     print (f"Running setup and tests for {db}")
 9 | 
10 |     dbt_seed(f'--select public_macros.cleaning', db, dbt_vars)
11 |     dbt_run(f'--select sanitized_user_data+', db, dbt_vars)
12 |     dbt_test(f'--select sanitized_user_data', db, dbt_vars)
13 | 
14 |     print (f"Running tests completed for {db}")
15 | 


--------------------------------------------------------------------------------
/models/metrics/types/samples/re_data_table_samples.sql:
--------------------------------------------------------------------------------
 1 | {{
 2 |     config(
 3 |         materialized='table',
 4 |         unique_key = 'table_name',
 5 |         on_schema_change='sync_all_columns',
 6 |     )
 7 | }}
 8 | 
 9 | -- depends_on: {{ ref('re_data_last_table_samples') }}
10 | -- depends_on: {{ ref('re_data_last_table_samples_part') }}
11 | 
12 | select
13 |     table_name,
14 |     sample_data,
15 |     cast ({{dbt.current_timestamp_backcompat()}} as {{ timestamp_type() }}) as sampled_on
16 | 
17 | from {{ ref('re_data_last_table_samples_part') }}
18 | 


--------------------------------------------------------------------------------
/integration_tests/python_tests/test_normalizers.py:
--------------------------------------------------------------------------------
 1 | from .utils.run import dbt_seed, dbt_run, dbt_test
 2 | 
 3 | def test_normalizers(db, source_schema, debug=True):
 4 |     dbt_vars = {
 5 |         'source_schema': source_schema
 6 |     }
 7 | 
 8 |     print (f"Running setup and tests for {db}")
 9 | 
10 |     dbt_seed(f'--select public_macros.normalizing', db, dbt_vars)
11 |     dbt_run(f'--select us_states_normalized+', db, dbt_vars)
12 |     dbt_test(f'--models us_states_normalized', db, dbt_vars)
13 | 
14 |     print (f"Running tests completed for {db}")
15 | 


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/validating/expected_validated_date_and_time.csv:
--------------------------------------------------------------------------------
 1 | date_time,valid_date_eu,valid_date_us,valid_date_inverse,valid_date_iso_8601,valid_time_24h,valid_time_12h,valid_time
 2 | 31-01-2020,1,0,0,0,0,0,0
 3 | 01/31/2020,0,1,0,0,0,0,0
 4 | 05.05.2020,1,1,0,0,0,0,0
 5 | 2020-01-31,0,0,1,0,0,0,0
 6 | 23:59,0,0,0,0,1,0,1
 7 | 12:59,0,0,0,0,1,1,1
 8 | 13:59:01,0,0,0,0,0,0,1
 9 | "12:59:01,55",0,0,0,0,0,0,1
10 | 11:59:00,0,0,0,0,0,0,1
11 | midnight,0,0,0,0,0,0,0
12 | 2020-01-31T12:59:00+02:00,0,0,0,1,0,0,0
13 | 2020-01-31T12:59:00,0,0,0,1,0,0,0
14 | 


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Bug report
 3 | about: Create a report to help us improve
 4 | title: "[BUG]"
 5 | labels: bug
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Describe the bug**
11 | Description of what the bug is.
12 | 
13 | **Expected behavior**
14 | Description of what you expected to happen.
15 | 
16 | **To Reproduce**
17 | Steps to reproduce the behavior:
18 | 
19 | **Screenshots**
20 | If applicable, add screenshots to help explain your problem.
21 | 
22 | **Logs and additional context**
23 | If application, any other context, logs etc.here
24 | 


--------------------------------------------------------------------------------
/integration_tests/python_tests/test_validate.py:
--------------------------------------------------------------------------------
 1 | from .utils.run import dbt_seed, dbt_run, dbt_test
 2 | 
 3 | def test_validate_regex(db, source_schema, debug=True):
 4 |     dbt_vars = {
 5 |         'source_schema': source_schema
 6 |     }
 7 | 
 8 |     print (f"Running setup and tests for {db}")
 9 | 
10 |     dbt_seed(
11 |         f'--select public_macros.validating', db, dbt_vars
12 |     )
13 | 
14 |     dbt_run(f'--select public_macros.validating', db, dbt_vars)
15 |     dbt_test(f'--select public_macros.validating', db, dbt_vars)
16 | 
17 |     print (f"Running tests completed for {db}")


--------------------------------------------------------------------------------
/integration_tests/seeds/monitoring/sample_without_time_filter.csv:
--------------------------------------------------------------------------------
 1 | title,rental_rate,rating
 2 | Academy Dinosaur,0.99,PG-13
 3 | Alamo Videotape,0.99,G
 4 | Affair Prejudice,2.99,G
 5 | African Egg,2.99,G
 6 | Ace Goldfinger,4.99,G
 7 | Alice Fantasia,0.99,NC-17
 8 | Adaptation Holes,2.99,NC-17
 9 | Alien Center,2.99,NC-17
10 | Aladdin Calendar,4.99,NC-17
11 | Chamber Italian,4.99,NC-17
12 | Alaska Phantom,0.99,PG
13 | Agent Truman,2.99,PG
14 | Ali Forever,4.99,PG
15 | Alabama Devil,2.99,PG-13
16 | Bright Encounters,4.99,PG-13
17 | Airplane Sierra,4.99,PG-13
18 | Date Speed,0.99,R
19 | Grosse Wonderful,4.99,R
20 | Airport Pollock,4.99,R


--------------------------------------------------------------------------------
/macros/public/store/export_table_samples.sql:
--------------------------------------------------------------------------------
 1 | {% macro export_table_samples(start_date, end_date, table_samples_path=None) %}
 2 |     {% set table_samples_query %}
 3 |         select
 4 |             lower(table_name) as table_name,
 5 |             sample_data,
 6 |             sampled_on
 7 |         from
 8 |             {{ ref('re_data_table_samples') }}
 9 |     {% endset %}
10 | 
11 |     {% set query_result = run_query(table_samples_query) %}
12 |     {% set table_samples_file_path = table_samples_path or 'target/re_data/table_samples.json' %}
13 |     {% do query_result.to_json(table_samples_file_path) %}
14 | 
15 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/macros/test_utils.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {% macro to_big_integer(field) %}
 3 |     cast (round({{field}} * 1000) as integer) as {{field}}
 4 | {% endmacro %}
 5 | 
 6 | {% macro clean_table_name(field) %}
 7 |     upper(
 8 |         {{-
 9 |             re_data.clean_blacklist(
10 |                 re_data.split_and_return_nth_value(field, '.', 3),
11 |                 ['"', '`'],
12 |                 ''
13 |             )
14 |         -}}
15 |     )
16 | {% endmacro %}
17 | 
18 | {% macro clean_column_name(field) %}
19 |     case when ({{ field }} = '' or  {{ field }} is null ) then '---' else  upper({{field}}) end
20 | {% endmacro %}
21 | 


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/filtering/deduplicated.sql:
--------------------------------------------------------------------------------
 1 | with x as 
 2 |     {{ re_data.filter_remove_duplicates(
 3 |         ref('duplicated'), ['transaction_id'], ['creation_time']) }}
 4 | 
 5 | select *, 'take_first' as use_case from x
 6 | 
 7 | union all
 8 | 
 9 | select *, 'take_last' as use_case from {{ re_data.filter_remove_duplicates(
10 |         ref('duplicated'), ['transaction_id'], ['creation_time desc']) }} duplicates
11 | 
12 | 
13 | union all
14 | 
15 | select *, 'take_all_statuses' as use_case from {{ re_data.filter_remove_duplicates(
16 |     ref('duplicated'), ['transaction_id', 'status'], ['creation_time desc']) }} duplicates


--------------------------------------------------------------------------------
/macros/utils/regular_expression.sql:
--------------------------------------------------------------------------------
 1 | {% macro regex_match_expression(column_name, pattern) %}
 2 |     {{ adapter.dispatch('regex_match_expression', 're_data')(column_name, pattern) }}
 3 | {% endmacro %}
 4 | 
 5 | {% macro default__regex_match_expression(column_name, pattern) %}
 6 |     ({{column_name}} ~ '{{pattern}}')
 7 | {% endmacro %}
 8 | 
 9 | {% macro bigquery__regex_match_expression(column_name, pattern) %}
10 |     regexp_contains({{column_name}}, r'{{pattern}}')
11 | {% endmacro %}
12 | 
13 | {% macro snowflake__regex_match_expression(column_name, pattern) %}
14 |     regexp_like({{column_name | upper}}, '{{pattern}}')
15 | {% endmacro %}
16 | 


--------------------------------------------------------------------------------
/integration_tests/models/monitoring/test_re_data_z_score.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {% set values_compare = [
 3 |     'z_score_value',
 4 |     'modified_z_score_value', 
 5 |     'last_value',
 6 |     'last_avg',
 7 |     'last_stddev',
 8 |     'last_median',
 9 |     'last_iqr',
10 |     'last_median_absolute_deviation',
11 |     'last_mean_absolute_deviation',
12 | ] %}
13 | 
14 | select
15 |     {{ clean_table_name('table_name') }} as table_name,
16 |     {{ clean_column_name('column_name') }} as column_name,
17 |     metric,
18 |     time_window_end,
19 |     {% for col in values_compare %}{{ to_big_integer(col) }},{% endfor %}
20 |     interval_length_sec
21 | 
22 | from {{ ref('re_data_z_score') }}


--------------------------------------------------------------------------------
/macros/utils/deduplication/add_duplication_context.sql:
--------------------------------------------------------------------------------
 1 | {% macro add_duplication_context(relation, unique_cols, sort_columns) %}
 2 | 
 3 |             select {{ dbt_utils.star(from=relation) }}
 4 |             , count(*) over (
 5 |                  partition by {{ re_data.comma_delimited_list(unique_cols) }} 
 6 |             ) as re_data_duplicate_group_row_count
 7 |             , row_number() over (
 8 |                 partition by {{ re_data.comma_delimited_list(unique_cols) }} {% if sort_columns %} order by {{ re_data.comma_delimited_list(sort_columns) }} {% endif %}
 9 |             ) as re_data_duplicate_group_row_number
10 | 
11 |             from {{ relation }}
12 | 
13 | {% endmacro %}


--------------------------------------------------------------------------------
/models/logs/re_data_test_history.sql:
--------------------------------------------------------------------------------
 1 | {{
 2 |     config(
 3 |         materialized='incremental',
 4 |         on_schema_change='sync_all_columns',
 5 |     )
 6 | }}
 7 | 
 8 | {{
 9 |     re_data.empty_table_generic([
10 |         ('table_name', 'string'),
11 |         ('column_name', 'string'),
12 |         ('test_name', 'string'),
13 |         ('status', 'string'),
14 |         ('execution_time', 'numeric'),
15 |         ('message', 'string'),
16 |         ('failures_count', 'numeric'),
17 |         ('failures_json', 'long_string'),
18 |         ('failures_table', 'long_string'),
19 |         ('severity', 'string'),
20 |         ('compiled_sql', 'long_string'),
21 |         ('run_at', 'timestamp')
22 |     ])
23 | }}


--------------------------------------------------------------------------------
/models/meta/re_data_monitored.sql:
--------------------------------------------------------------------------------
 1 | {{
 2 |     config(
 3 |         materialized='table',
 4 |         unique_key = 'table_name',
 5 |         post_hook="{% if execute %}{{ pub_insert_into_re_data_monitored() }}{% endif %}"
 6 |     )
 7 | }}
 8 | 
 9 | {{
10 |     re_data.empty_table_generic([
11 |         ('name', 'string'),
12 |         ('schema', 'string'),
13 |         ('database', 'string'),
14 |         ('time_filter', 'string'),
15 |         ('metrics_groups', 'string'),
16 |         ('additional_metrics', 'string'),
17 |         ('metrics', 'string'),
18 |         ('columns', 'string'),
19 |         ('anomaly_detector', 'string'),
20 |         ('owners', 'string'),
21 |         ('selected', 'boolean')
22 |     ])
23 | }}


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/validating/validate_ip.csv:
--------------------------------------------------------------------------------
 1 | ip_address
 2 | 1.2.3.4
 3 | 01.102.103.104
 4 | 124.171.228.4
 5 | 192.168.1.35
 6 | 192.168.1.198
 7 | 127.248.111.240
 8 | 01.1.1
 9 | 12325412
10 | notvalidatall
11 | 232.232.33
12 | 232.3232.232.232+2312
13 | ::::erwerwe
14 | ::3343:4343434343:34343:343434343:443
15 | 
16 | 2001:db8:3333:4444:5555:6666:7777:8888
17 | 2001:db8:3333:4444:CCCC:DDDD:EEEE:FFFF
18 | 2001:db8::
19 | ::1234:5678
20 | 2001:db8::1234:5678
21 | ::11.22.33.44
22 | 2001:db8::123.123.123.123
23 | 2001:db8::1234:5678:5.6.7.8
24 | 2001:db8:3333:4444:5555:6666:1.2.3.4
25 | ::11.22.33.44
26 | 2001:db8::123.123.123.123
27 | ::1234:5678:91.123.4.56
28 | ::1234:5678:1.2.3.4
29 | 2001:db8::1234:5678:5.6.7.8


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/normalizing/abbreviated_us_states.csv:
--------------------------------------------------------------------------------
 1 | state,code
 2 | Ala.,AL
 3 | Alaska,AK
 4 | Ariz.,AZ
 5 | Ark.,AR
 6 | Calif.,CA
 7 | Colo.,CO
 8 | Conn.,CT
 9 | Del.,DE
10 | D.C.,DC
11 | Fla.,FL
12 | Ga.,GA
13 | Hawaii,HI
14 | Idaho,ID
15 | Ill.,IL
16 | Ind.,IN
17 | Iowa,IA
18 | Kans.,KS
19 | Ky.,KY
20 | La.,LA
21 | Maine,ME
22 | Md.,MD
23 | Mass.,MA
24 | Mich.,MI
25 | Minn.,MN
26 | Miss.,MS
27 | Mo.,MO
28 | Mont.,MT
29 | Nebr.,NE
30 | Nev.,NV
31 | N.H.,NH
32 | N.J.,NJ
33 | N.M.,NM
34 | N.Y.,NY
35 | N.C.,NC
36 | N.D.,ND
37 | Ohio,OH
38 | Okla.,OK
39 | Ore.,OR
40 | Pa.,PA
41 | R.I.,RI
42 | S.C.,SC
43 | S.D.,SD
44 | Tenn.,TN
45 | Tex.,TX
46 | Utah,UT
47 | Vt.,VT
48 | Va.,VA
49 | Wash.,WA
50 | W.Va.,WV
51 | Wis.,WI
52 | Wyo.,WY
53 | 


--------------------------------------------------------------------------------
/integration_tests/models/monitoring/schema.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | models:
 4 |   - name: test_re_data_metrics
 5 |     tests:
 6 |       - dbt_utils.equality:
 7 |           compare_model: ref('expected_metrics')
 8 | 
 9 |   - name: test_re_data_z_score
10 |     tests:
11 |       - dbt_utils.equality:
12 |           compare_model: ref('expected_z_score')
13 | 
14 |   - name: test_re_data_anomalies
15 |     tests:
16 |       - dbt_utils.equality:
17 |           compare_model: ref('expected_anomalies')
18 | 
19 |   - name: test_re_data_test_history
20 |     tests:
21 |      - dbt_utils.equality:
22 |          compare_model: ref('expected_test_history')
23 | 
24 |   - name: test_re_data_table_samples
25 |     tests:
26 |      - dbt_utils.equality:
27 |          compare_model: ref('expected_table_samples')


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/validating/schema.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | models:
 4 |   - name: validated_emails
 5 |     tests:
 6 |       - dbt_utils.equality:
 7 |           compare_model: ref('expected_validated_emails')
 8 | 
 9 |   - name: validated_numbers
10 |     tests:
11 |       - dbt_utils.equality:
12 |           compare_model: ref('expected_validated_numbers')
13 | 
14 |   - name: validated_date_and_time
15 |     tests:
16 |       - dbt_utils.equality:
17 |           compare_model: ref('expected_validated_date_and_time')
18 | 
19 |   - name: validated_ips
20 |     tests:
21 |       - dbt_utils.equality:
22 |           compare_model: ref('expected_validated_ips')
23 |   
24 |   - name: validated_uuids
25 |     tests:
26 |       - dbt_utils.equality:
27 |           compare_model: ref('expected_validated_uuids')


--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/feature_request.md:
--------------------------------------------------------------------------------
 1 | ---
 2 | name: Feature request
 3 | about: Suggest an idea for this project
 4 | title: "[FEATURE]"
 5 | labels: ''
 6 | assignees: ''
 7 | 
 8 | ---
 9 | 
10 | **Tell us about the problem you're trying to solve**
11 | What are you trying to do, and why is it hard? A clear and concise description of what the problem is. Ex. I'm always frustrated when [...]
12 | 
13 | **Describe the solution you’d like**
14 | A clear and concise description of what you want to see happen, or the change you would like to see
15 | 
16 | **Describe the alternative you’ve considered or used**
17 | A clear and concise description of any alternative solutions or features you've considered or are using today.
18 | 
19 | **Additional context**
20 | Add any other context or screenshots about the feature request here.
21 | 


--------------------------------------------------------------------------------
/macros/utils/json/to_single_json.sql:
--------------------------------------------------------------------------------
 1 | {% macro to_json_string_value_or_null(column) %}
 2 |     (
 3 |         case 
 4 |             when {{ column }} is null then 'null'
 5 |             else '"' ||
 6 |                 regexp_replace(
 7 |                     replace(cast({{ column }} as {{ string_type() }}), '"', {{ escape_seq_for_json('"') }}),
 8 |                     '\n', {{ quote_new_line() }} {% if target.type in postgres_type_db() %}, 'g' {% endif %}
 9 |                 ) || '"'
10 |         end
11 |     )
12 | {% endmacro %}
13 | 
14 | {% macro to_single_json(columns) %}
15 |     '{' ||
16 |     {%- for column in columns %}
17 |         '"{{ column }}": ' ||
18 |         {{ to_json_string_value_or_null(column) }}
19 |         {%- if not loop.last %} || ',' || {%- endif %}
20 |     {%- endfor %}
21 |     || '}'
22 | {% endmacro %}
23 | 


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/validating/validated_numbers.sql:
--------------------------------------------------------------------------------
 1 | with 
 2 |     all_num_rows as (
 3 |         select * from {{ ref('validate_numbers') }}
 4 |     )
 5 | 
 6 | select *, 
 7 |     case when {{ re_data.valid_number('number') }} then 1 else 0 end as is_number,
 8 |     case when {{ re_data.valid_number_decimal_point('number') }} then 1 else 0 end as is_number_decimal_point,
 9 |     case when {{ re_data.valid_number_decimal_comma('number') }} then 1 else 0 end as is_number_decimal_comma,
10 |     case when {{ re_data.valid_number_percentage('number') }} then 1 else 0 end as is_percentage,
11 |     case when {{ re_data.valid_number_percentage_point('number') }} then 1 else 0 end as is_percentage_decimal_point,
12 |     case when {{ re_data.valid_number_percentage_comma('number') }} then 1 else 0 end as is_percentage_decimal_comma
13 |     from all_num_rows 
14 | 


--------------------------------------------------------------------------------
/macros/meta/save_monitored.sql:
--------------------------------------------------------------------------------
 1 | {% macro save_monitored(monitored_path) %}
 2 | 
 3 |     {% set monitored_query %}
 4 |         select
 5 |             {{ full_table_name('name', 'schema', 'database') }} as {{ re_data.quote_column('model') }},
 6 |              time_filter as {{ re_data.quote_column('time_filter') }},
 7 |             metrics as {{ re_data.quote_column('metrics') }},
 8 |             columns as {{ re_data.quote_column('columns') }},
 9 |             anomaly_detector as {{ re_data.quote_column('anomaly_detector') }},
10 |             owners as {{ re_data.quote_column('owners') }}
11 |         from {{ ref('re_data_selected') }}
12 |     {% endset %}
13 |     {% set query_result = run_query(monitored_query) %}
14 |     {% set monitored_file_path = monitored_path or 'target/re_data/monitored.json' %}
15 |     {% do query_result.to_json(monitored_file_path) %}
16 | 
17 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/metrics/base/internal_model_template.sql:
--------------------------------------------------------------------------------
 1 | {% macro re_data_last_base_metrics_part() %}
 2 | 
 3 | -- depends_on: {{ ref('re_data_columns') }}
 4 | 
 5 | {{
 6 |     config(
 7 |         materialized='table',
 8 |     )
 9 | }}
10 | 
11 | {{ re_data.empty_last_base_metrics() }}
12 | 
13 | {% endmacro %}
14 | 
15 | {% macro re_data_last_base_metrics_thread(num) %}
16 |     {% set part_name = 're_data_last_base_metrics_part' ~ num %}
17 |     {{ re_data.generate_depends(['re_data_selected', 're_data_monitored', 're_data_columns', 're_data_run_started_at', part_name]) }}
18 | 
19 |     {{
20 |         config(
21 |             materialized='table',
22 |         )
23 |     }}
24 | 
25 |     {% if not re_data.in_compile() %}
26 |         {{ re_data.metrics_base_compute_for_thread(num, part_name) }}
27 |     {% endif %}
28 | 
29 |     {{ re_data.empty_last_base_metrics() }}
30 | 
31 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/utils/agate/row_value.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {% macro row_value(agate_row, column) %}
 3 |     {{ return (agate_row[re_data.name_in_db(column)]) }}
 4 | {% endmacro %}
 5 | 
 6 | {% macro agate_to_list(table) %}
 7 |     {% set col_names = table.column_names %}
 8 |     {% set query_result = [] %}
 9 |     {% for row in table.rows %}
10 |         {% set pairs = [] %}
11 |         {% for col_name in col_names %}
12 |             {% set value = row.get(col_name) | string %}
13 |             {% do pairs.append('"' ~ (col_name | lower) ~ '":' ~ '"' ~ (value | replace('"', '\\\"') | replace('\n', '\\n') ) ~ '"') %}
14 |         {% endfor %}
15 |         {% set joined_pairs = '{' ~ (pairs | join(',')) ~ '}' %}
16 |         {% do query_result.append(joined_pairs) %}
17 |     {% endfor %}
18 |     {% set query_result = '[' ~ (query_result | join(',')) ~ ']' %}
19 |     {{ return (query_result) }}
20 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/validating/validated_date_and_time.sql:
--------------------------------------------------------------------------------
 1 | with 
 2 |     all_rows as (
 3 |         select * from {{ ref('validate_date_and_time') }}
 4 |     )
 5 | 
 6 | select *, 
 7 |     case when {{ re_data.valid_date_eu('date_time') }} then 1 else 0 end as valid_date_eu,
 8 |     case when {{ re_data.valid_date_us('date_time') }} then 1 else 0 end as valid_date_us,
 9 |     case when {{ re_data.valid_date_inverse('date_time') }} then 1 else 0 end as valid_date_inverse,
10 |     case when {{ re_data.valid_date_iso_8601('date_time') }} then 1 else 0 end as valid_date_iso_8601,
11 |     case when {{ re_data.valid_time_24h('date_time') }} then 1 else 0 end as valid_time_24h,
12 |     case when {{ re_data.valid_time_12h('date_time') }} then 1 else 0 end as valid_time_12h,
13 |     case when {{ re_data.valid_time('date_time') }} then 1 else 0 end as valid_time
14 |     from all_rows 
15 | 


--------------------------------------------------------------------------------
/integration_tests/python_tests/test_filters.py:
--------------------------------------------------------------------------------
 1 | from .utils.run import dbt_seed, dbt_run, dbt_test
 2 | 
 3 | def test_deduplication(db, source_schema, debug=True):
 4 |     dbt_vars = {
 5 |         'source_schema': source_schema
 6 |     }
 7 | 
 8 |     print (f"Running setup and tests for {db}")
 9 | 
10 |     dbt_seed(f'--select public_macros.filtering', db, dbt_vars)
11 |     dbt_run(f'--select deduplicated', db, dbt_vars)
12 |     dbt_test(f'--select deduplicated', db, dbt_vars)
13 | 
14 | def test_get_duplicates(db, source_schema, debug=True):
15 |     dbt_vars = {
16 |         'source_schema': source_schema
17 |     }
18 | 
19 |     print (f"Running setup and tests for {db}")
20 | 
21 |     dbt_seed(f'--select public_macros.filtering', db, dbt_vars)
22 |     dbt_run(f'--select duplicates', db, dbt_vars)
23 |     dbt_test(f'--select duplicates', db, dbt_vars)
24 | 
25 |     print (f"Running tests completed for {db}")
26 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | 
 2 | # Maybe change this if you are not running on a Mac
 3 | CONTAINER_ARCH = linux/amd64
 4 | 
 5 | .PHONY: help run-all-ci test-postgres generate-docs
 6 | 
 7 | help:
 8 | 	$(info ${HELP_MESSAGE})
 9 | 	@exit 0
10 | 
11 | 
12 | # Run GitHub Actions CI jobs locally
13 | run-all-ci: test-postgres generate-docs
14 | 	@echo "All CI steps completed."
15 | 
16 | test-postgres:
17 | 	@echo "Running test-postgres job..."
18 | 	act -j test-postgres --container-architecture $(CONTAINER_ARCH)
19 | 
20 | generate-docs:
21 | 	@echo "Running generate-docs job..."
22 | 	act -j generate-docs --container-architecture $(CONTAINER_ARCH)
23 | 
24 | 
25 | define HELP_MESSAGE
26 | Usage: $ make [TARGETS]
27 | 
28 | TARGETS
29 |   help                   Shows this help message
30 |   run-all-ci             Runs all CI steps
31 |   test-postgres          Runs test-postgres job
32 |   generate-docs          Generates documentation
33 | 
34 | endef
35 | 


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/cleaning/expected_sample_user_data.csv:
--------------------------------------------------------------------------------
 1 | "full_name","email"
 2 | "Lizzie Effertz","*****@fakemail.com"
 3 | "Orlando Abbott","*****@fakemail.com"
 4 | "Kelley Harann","*****@fakemail.com"
 5 | "Ruth Langworth","*****@fakemail.com"
 6 | "Lane Swift","*****@fakemail.com"
 7 | "Bertha Corwin","*****@fakemail.com"
 8 | "Manuela Kling","*****@fakemail.com"
 9 | "Mose Balistreri","*****@fakemail.com"
10 | "Robin Halvorson","*****@fakemail.com"
11 | "Osbaldo Parker I","*****@fakemail.com"
12 | "Javier Runolfsson","*****@fakemail.net"
13 | "Amelia Batz","*****@fakemail.com"
14 | "Abby Pouros","*****@fakemail.com"
15 | "Markus Homenick","*****@fakemail.com"
16 | "Braeden Turner","*****@fakemail.com"
17 | "Horacio Parker","*****@fakemail.info"
18 | "Ms. Stacy Padberg","*****@fakemail.com"
19 | "Dr. Deshawn Stracke","*****@fakemail.com"
20 | "Pascale Grady","*****@fakemail.com"
21 | "Lacy Brekke","*****@fakemail.com"
22 | 


--------------------------------------------------------------------------------
/integration_tests/models/metrics/re_data_metrics.yml:
--------------------------------------------------------------------------------
 1 | 
 2 | version: 2
 3 | 
 4 | models:
 5 |   - name: re_data_metrics
 6 |     tests:
 7 |       - re_data.metric_expression_is_true:
 8 |           table: ref('buy_events')
 9 |           metric: max_length
10 |           column_name: event_type
11 |           expression: value = 3
12 | 
13 |       - re_data.metric_equal_to:
14 |           table: ref('buy_events')
15 |           metric: max_length
16 |           column_name: event_type
17 |           value: 3
18 | 
19 |       - re_data.metric_in_range:
20 |           table: ref('buy_events')
21 |           metric: max_length
22 |           column_name: event_type
23 |           min_value: 3
24 |           max_value: 3
25 | 
26 |       - re_data.metric_expression_is_true:
27 |           table: ref('buy_events')
28 |           metric: row_count
29 |           expression: value > 0 and value < 10
30 |           condition: time_window_start >= '2021-05-02'


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/validating/expected_validated_ips.csv:
--------------------------------------------------------------------------------
 1 | ip_address,valid_ip_v4,valid_ip_v6,valid_ip
 2 | 1.2.3.4,1,0,1
 3 | 01.102.103.104,1,0,1
 4 | 124.171.228.4,1,0,1
 5 | 192.168.1.35,1,0,1
 6 | 192.168.1.198,1,0,1
 7 | 127.248.111.240,1,0,1
 8 | 01.1.1,0,0,0
 9 | 12325412,0,0,0
10 | notvalidatall,0,0,0
11 | 232.232.33,0,0,0
12 | 232.3232.232.232+2312,0,0,0
13 | ::::erwerwe,0,0,0
14 | ::3343:4343434343:34343:343434343:443,0,0,0
15 | ,0,0,0
16 | 2001:db8:3333:4444:5555:6666:7777:8888,0,1,1
17 | 2001:db8:3333:4444:CCCC:DDDD:EEEE:FFFF,0,1,1
18 | 2001:db8::,0,1,1
19 | ::1234:5678,0,1,1
20 | 2001:db8::1234:5678,0,1,1
21 | ::11.22.33.44,0,1,1
22 | 2001:db8::123.123.123.123,0,1,1
23 | 2001:db8::1234:5678:5.6.7.8,0,1,1
24 | 2001:db8:3333:4444:5555:6666:1.2.3.4,0,1,1
25 | ::11.22.33.44,0,1,1
26 | 2001:db8::123.123.123.123,0,1,1
27 | ::1234:5678:91.123.4.56,0,1,1
28 | ::1234:5678:1.2.3.4,0,1,1
29 | 2001:db8::1234:5678:5.6.7.8,0,1,1
30 | 


--------------------------------------------------------------------------------
/macros/public/store/export_tests_history.sql:
--------------------------------------------------------------------------------
 1 | {% macro export_tests_history(start_date, end_date, tests_history_path=None) %}
 2 |     {% set tests_history_query %}
 3 |         select
 4 |             table_name,
 5 |             column_name,
 6 |             test_name,
 7 |             run_at,
 8 |             status,
 9 |             execution_time, 
10 |             message, 
11 |             failures_count, 
12 |             failures_json, 
13 |             failures_table,
14 |             severity, 
15 |             compiled_sql
16 |         from
17 |             {{ ref('re_data_test_history') }}
18 |         where {{ in_date_window('run_at', start_date, end_date) }}
19 |     {% endset %}
20 | 
21 |     {% set query_result = run_query(tests_history_query) %}
22 |     {% set tests_history_file_path = tests_history_path or 'target/re_data/tests_history.json' %}
23 |     {% do query_result.to_json(tests_history_file_path) %}
24 | 
25 | {% endmacro %}
26 | 


--------------------------------------------------------------------------------
/integration_tests/macros/my_metrics.sql:
--------------------------------------------------------------------------------
 1 | {% macro re_data_metric_diff(context) %}
 2 |     max({{context.column_name}}) - min({{context.column_name}})
 3 | {% endmacro %}
 4 | 
 5 | {% macro re_data_metric_my_custom_table_metric(context) %}
 6 |     1000
 7 | {% endmacro %}
 8 | 
 9 | {% macro re_data_metric_regex_test(context) %}
10 |     {{ regex_test(context.column_name, context.config.regex) }}
11 | {% endmacro %}
12 | 
13 | {% macro regex_test(column_name, pattern) %}
14 |     coalesce(
15 |         sum(
16 |             case when {{ re_data.regex_match_expression(column_name, pattern) }}
17 |                 then 1
18 |             else 0
19 |             end
20 |         ), 0
21 |     )
22 | {% endmacro %}
23 | 
24 | {% macro re_data_metric_my_distinct_table_rows(context) %}
25 |     with temp_table AS (
26 |             select distinct * from {{ context.table_name }}
27 |         )
28 |     select coalesce(count(*), 0) FROM temp_table
29 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/public/filtering/get_duplicates.sql:
--------------------------------------------------------------------------------
 1 | {% https://github.com/re-data/re-data/issues/143 %}
 2 | 
 3 | {#
 4 | macro returns rows with she same key set (unique_cols)
 5 | 
 6 | along with the fields of the base model duplicates information added:
 7 | re_data_duplicate_count - total number of duplicates with the same current key set
 8 | re_data_duplicate_row_number - number of a duplicate row inside the group of duplicates with the same current key set
 9 | #}
10 | 
11 | {% macro filter_get_duplicates(relation, unique_cols, sort_columns) %}
12 |     (
13 |         with duplication_context as (
14 |             {{re_data.add_duplication_context(relation, unique_cols, sort_columns)}}
15 |         ),
16 |         duplicate_rows as (
17 |             select * from duplication_context where re_data_duplicate_group_row_count > 1
18 |         )
19 |         {# return surrogate key as well? #}
20 |         select *
21 |         from duplicate_rows
22 |     ) 
23 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/public/cleaning/clean_additional_whitespace.sql:
--------------------------------------------------------------------------------
 1 | {% macro clean_additional_whitespaces(column_name) %}
 2 |     {{ adapter.dispatch('clean_additional_whitespaces', 're_data')(column_name) }}
 3 | {% endmacro %}
 4 | 
 5 | {% macro default__clean_additional_whitespaces(column_name) %}
 6 |     trim(regexp_replace( {{ column_name }}, '\s\s+', ' '))
 7 | {% endmacro %}
 8 | 
 9 | {% macro postgres__clean_additional_whitespaces(column_name) %}
10 |     trim(regexp_replace( {{ column_name }}, '\s\s+', ' ', 'g'))
11 | {% endmacro %}
12 | 
13 | {% macro redshift__clean_additional_whitespaces(column_name) %}
14 |     trim(regexp_replace( {{ column_name }}, '\\s\\s+', ' '))
15 | {% endmacro %}
16 | 
17 | {% macro bigquery__clean_additional_whitespaces(column_name) %}
18 |     trim(regexp_replace( {{ column_name }}, r'\s\s+', ' '))
19 | {% endmacro %}
20 | 
21 | {% macro snowflake__clean_additional_whitespaces(column_name) %}
22 |     trim(regexp_replace( {{ column_name }}, '\\s\\s+', ' '))
23 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/filtering/expected_deduplicated.csv:
--------------------------------------------------------------------------------
 1 | transaction_id,creation_time,status,value,use_case
 2 | 1,2021-05-01 12:31:32,pending,100,take_first
 3 | 2,2021-05-01 12:35:35,pending,200,take_first
 4 | 3,2021-05-01 12:40:35,pending,300,take_first
 5 | 4,2021-05-02 12:35:35,completed,10,take_first
 6 | 5,2021-05-02 12:40:35,pending,100,take_first
 7 | 1,2021-05-01 12:40:35,completed,100,take_last
 8 | 2,2021-05-01 12:35:35,pending,200,take_last
 9 | 3,2021-05-02 12:31:32,completed,300,take_last
10 | 4,2021-05-02 12:40:35,completed,40,take_last
11 | 5,2021-05-02 12:40:35,pending,100,take_last
12 | 1,2021-05-01 12:40:35,completed,100,take_all_statuses
13 | 1,2021-05-01 12:31:32,pending,100,take_all_statuses
14 | 2,2021-05-01 12:35:35,pending,200,take_all_statuses
15 | 3,2021-05-02 12:31:32,completed,300,take_all_statuses
16 | 3,2021-05-01 12:40:35,pending,300,take_all_statuses
17 | 4,2021-05-02 12:40:35,completed,40,take_all_statuses
18 | 5,2021-05-02 12:40:35,pending,100,take_all_statuses
19 | 


--------------------------------------------------------------------------------
/integration_tests/python_tests/utils/run.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import yaml
 3 | 
 4 | def dbt_command(command, for_db, dbt_vars, threads=None):
 5 |     debug = 'DBT_MACRO_DEBUGGING=1 '
 6 |     profile_part = f' --profile re_data_{for_db}'
 7 |     yaml_vars = yaml.dump(dbt_vars)
 8 |     cmd = f'{debug} {command} --vars "{yaml_vars}" {profile_part}'
 9 |     if threads:
10 |         cmd += f' --threads {threads}'
11 |     assert os.system(cmd) == 0
12 | 
13 | def dbt_seed(args, for_db, dbt_vars):
14 |     dbt_command(f'dbt seed --full-refresh {args}', for_db, dbt_vars, threads=4)
15 | 
16 | def dbt_run(args, for_db, dbt_vars):
17 |     dbt_command(f'dbt run --full-refresh --fail-fast {args}', for_db, dbt_vars, threads=4)
18 | 
19 | def dbt_test(args, for_db, dbt_vars):
20 |     dbt_command(f'dbt test --store-failures --fail-fast {args}', for_db, dbt_vars, threads=4)
21 | 
22 | def dbt_build(args, for_db, dbt_vars):
23 |     dbt_command(f'dbt build --full-refresh --store-failures --fail-fast {args}', for_db, dbt_vars, threads=4)
24 | 


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/cleaning/sample_user_data.csv:
--------------------------------------------------------------------------------
 1 | full_name,email
 2 |   lizzie effertz,torp.trisha@fakemail.com
 3 |  orlando abbott,dayton.hermiston@fakemail.com
 4 | kelley     harann,borer.blake@fakemail.com
 5 | ruth   langworth,garett66@fakemail.com
 6 | lane swift  ,nienow.coralie@fakemail.com
 7 | bertha corwin ,tstroman@fakemail.com
 8 | manuela   kling,shawn.langworth@fakemail.com
 9 | mose balistreri,dorris70@fakemail.com
10 | robin    halvorson,murazik.americo@fakemail.com
11 |      osbaldo parker i  ,friesen.angeline@fakemail.com
12 | javier runolfsson  ,benjamin.bailey@fakemail.net
13 | amelia batz,garrison60@fakemail.com
14 | abby  pouros,dominique.leannon@fakemail.com
15 | markus homenick,piper73@fakemail.com
16 | braeden turner,kozey.jace@fakemail.com
17 | horacio   parker,vtillman@fakemail.info
18 | ms. stacy       padberg,erdman.elaina@fakemail.com
19 | dr.     deshawn stracke,rosendo.beer@fakemail.com
20 |   pascale grady,princess60@fakemail.com
21 | lacy     brekke,romaguera.darrell@fakemail.com
22 | 


--------------------------------------------------------------------------------
/macros/meta/information_schema.sql:
--------------------------------------------------------------------------------
 1 | {% macro get_monitored_columns(schema, database) %}
 2 |     {{ adapter.dispatch('get_monitored_columns', 're_data')(schema, database) }}
 3 | {% endmacro %}
 4 | 
 5 | {% macro default__get_monitored_columns(table_schema, db_name) %}
 6 |     {% set relation = api.Relation.create(database=db_name, schema=table_schema) %}
 7 |     select
 8 |         table_name,
 9 |         table_schema,
10 |         table_catalog,
11 |         column_name,
12 |         data_type,
13 |         is_nullable
14 |     from
15 |         {{ relation.information_schema('COLUMNS') }}
16 |     where
17 |         table_schema = '{{ table_schema }}'
18 | {% endmacro %}
19 | 
20 | {% macro redshift__get_monitored_columns(table_schema, db_name) %}
21 |     select
22 |         table_name,
23 |         table_schema,
24 |         table_catalog,
25 |         column_name,
26 |         data_type,
27 |         is_nullable
28 |     from
29 |         svv_columns
30 |     where
31 |         table_schema = '{{ table_schema }}'
32 | {% endmacro %}
33 | 


--------------------------------------------------------------------------------
/models/metrics/types/schema/re_data_columns_over_time.sql:
--------------------------------------------------------------------------------
 1 | {{
 2 |     config(
 3 |         materialized='incremental',
 4 |         unique_key = 'id',
 5 |         on_schema_change='sync_all_columns',
 6 |     )
 7 | }}
 8 | 
 9 | 
10 | with columns as (
11 | 
12 | select
13 |     {{ full_table_name('cols.name', 'cols.schema', 'cols.database') }} as table_name,
14 |     cols.column_name,
15 |     cols.data_type,
16 |     cols.is_nullable,
17 |     cast ({{dbt.current_timestamp_backcompat()}} as {{ timestamp_type() }} ) as detected_time
18 | from
19 |     {{ ref('re_data_columns')}} cols, {{ ref('re_data_selected')}} tables
20 | where
21 |     cols.name = tables.name and cols.schema = tables.schema and cols.database = tables.database
22 | )
23 | 
24 | select
25 |     cast ({{ dbt_utils.generate_surrogate_key([
26 |       'table_name',
27 |       'column_name',
28 |       'detected_time'
29 |     ]) }} as {{ string_type() }} ) as id,
30 |     table_name,
31 |     column_name,
32 |     data_type,
33 |     is_nullable,
34 |     detected_time
35 | from columns


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/normalizing/us_states_normalization.csv:
--------------------------------------------------------------------------------
 1 | source,target
 2 | Ala.,Alabama
 3 | Alaska,Alaska
 4 | Ariz.,Arizona
 5 | Ark.,Arkansas
 6 | Calif.,California
 7 | Colo.,Colorado
 8 | Conn.,Connecticut
 9 | Del.,Delaware
10 | D.C.,District of Columbia
11 | Fla.,Florida
12 | Ga.,Georgia
13 | Hawaii,Hawaii
14 | Idaho,Idaho
15 | Ill.,Illinois
16 | Ind.,Indiana
17 | Iowa,Iowa
18 | Kans.,Kansas
19 | Ky.,Kentucky
20 | La.,Louisiana
21 | Maine,Maine
22 | Md.,Maryland
23 | Mass.,Massachusetts
24 | Mich.,Michigan
25 | Minn.,Minnesota
26 | Miss.,Mississippi
27 | Mo.,Missouri
28 | Mont.,Montana
29 | Nebr.,Nebraska
30 | Nev.,Nevada
31 | N.H.,New Hampshire
32 | N.J.,New Jersey
33 | N.M.,New Mexico
34 | N.Y.,New York
35 | N.C.,North Carolina
36 | N.D.,North Dakota
37 | Ohio,Ohio
38 | Okla.,Oklahoma
39 | Ore.,Oregon
40 | Pa.,Pennsylvania
41 | R.I.,Rhode Island
42 | S.C.,South Carolina
43 | S.D.,South Dakota
44 | Tenn.,Tennessee
45 | Tex.,Texas
46 | Utah,Utah
47 | Vt.,Vermont
48 | Va.,Virginia
49 | Wash.,Washington
50 | W.Va.,West Virginia
51 | Wis.,Wisconsin
52 | Wyo.,Wyoming


--------------------------------------------------------------------------------
/models/alerts/re_data_alerts.sql:
--------------------------------------------------------------------------------
 1 | select
 2 |     'anomaly' as type,
 3 |     {{ re_data.clean_blacklist('table_name', ['"', '`'], '') }} as model,
 4 |     message,
 5 |     last_value_text as value,
 6 |     time_window_end
 7 | from
 8 |     {{ ref(var('re_data:re_data_anomalies_filtered')) }}
 9 | union all
10 | 
11 | select
12 |     'schema_change' as type,
13 |     {{ re_data.clean_blacklist('table_name', ['"', '`'], '') }} as model,
14 |     {{ generate_schema_change_message('operation', 'column_name', 'prev_column_name', 'prev_data_type', 'data_type', 'detected_time') }} as message,
15 |     '' as value,
16 |     detected_time as time_window_end
17 | from {{ ref('re_data_schema_changes') }}
18 | 
19 | union all
20 | 
21 | select
22 |     'test' as type,
23 |     table_name as model,
24 |     {{ generate_failed_test_message('test_name', 'column_name') }},
25 |     status as value,
26 |     run_at as time_window_end
27 | 
28 | from {{ ref('re_data_test_history') }}
29 | where
30 |     status = 'Fail'
31 |     or status = 'Error'
32 |     {% if var('re_data:show_warns_as_alerts') %}
33 |     or status = 'Warn'
34 |     {% endif %}
35 | 


--------------------------------------------------------------------------------
/macros/public/store/export_alerts.sql:
--------------------------------------------------------------------------------
 1 | {% macro export_alerts(start_date, end_date, alerts_path=None, monitored_path=None) %}
 2 |     {% set alerts_query %}
 3 |         select
 4 |             type as {{ re_data.quote_column('type') }},
 5 |             model as {{ re_data.quote_column('model') }},
 6 |             message as {{ re_data.quote_column('message') }},
 7 |             value as {{ re_data.quote_column('value') }},
 8 |             {{ format_timestamp('time_window_end')}} as {{ re_data.quote_column('time_window_end') }} 
 9 |         from {{ ref('re_data_alerts') }}
10 |         where
11 |             case
12 |                 when type = 'anomaly' then {{ in_date_window('time_window_end', start_date, end_date) }}
13 |                 else {{ in_date_window('time_window_end', start_date, none) }}
14 |             end
15 |         order by time_window_end desc
16 |     {% endset %}
17 | 
18 |     {% set query_result = run_query(alerts_query) %}
19 |     {% set alerts_file_path = alerts_path or 'target/re_data/alerts.json' %}
20 |     {% do query_result.to_json(alerts_file_path) %}
21 |     {{ save_monitored(monitored_path) }}
22 | {% endmacro %}
23 | 


--------------------------------------------------------------------------------
/integration_tests/macros/create_test_source_tables.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | {% macro create_test_source_tables() %}
 4 | 
 5 |     {% set create_table %}
 6 |         CREATE SCHEMA IF NOT EXISTS {{target.schema}};
 7 |         DROP TABLE IF EXISTS {{target.schema}}.re_data_source_test_table;
 8 |         CREATE TABLE IF NOT EXISTS {{target.schema}}.re_data_source_test_table (
 9 |             number {{ re_data.integer_type() }},
10 |             description {{ re_data.string_type() }},
11 |             created_at {{ re_data.timestamp_type() }}
12 |         );
13 |         INSERT INTO {{target.schema}}.re_data_source_test_table (number, description, created_at) VALUES 
14 |             (1, 'one', current_timestamp),
15 |             (2, 'two', current_timestamp),
16 |             (3, 'three', current_timestamp),
17 |             (4, 'four', current_timestamp),
18 |             (5, 'five', current_timestamp),
19 |             (6, 'six', current_timestamp),
20 |             (7, 'seven', current_timestamp),
21 |             (8, 'eight', current_timestamp),
22 |             (9, 'nine', current_timestamp),
23 |             (10, 'ten', current_timestamp
24 |         );
25 |     {% endset %}
26 |     {% do run_query(create_table) %}
27 | 
28 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/utils/fivetran_utils/json_extract.sql:
--------------------------------------------------------------------------------
 1 | {#
 2 | # This file contains significant part of code derived from 
 3 | # https://github.com/fivetran/dbt_fivetran_utils/tree/v0.4.0 which is licensed under Apache License 2.0.
 4 | #}
 5 | 
 6 | {% macro json_extract(string, string_path) -%}
 7 | 
 8 | {{ adapter.dispatch('json_extract','re_data') (string, string_path) }}
 9 | 
10 | {%- endmacro %}
11 | 
12 | {% macro default__json_extract(string, string_path) %}
13 | 
14 |   json_extract_path_text({{string}}, {{ "'" ~ string_path ~ "'" }} )
15 |  
16 | {% endmacro %}
17 | 
18 | {% macro snowflake__json_extract(string, string_path) %}
19 | 
20 |   json_extract_path_text(try_parse_json( {{string}} ), {{ "'" ~ string_path ~ "'" }} )
21 | 
22 | {% endmacro %}
23 | 
24 | {% macro redshift__json_extract(string, string_path) %}
25 | 
26 |   case when is_valid_json( {{string}} ) then json_extract_path_text({{string}}, {{ "'" ~ string_path ~ "'" }} ) else null end
27 |  
28 | {% endmacro %}
29 | 
30 | {% macro bigquery__json_extract(string, string_path) %}
31 | 
32 |   json_extract_scalar({{string}}, {{ "'$." ~ string_path ~ "'" }} )
33 | 
34 | {% endmacro %}
35 | 
36 | {% macro postgres__json_extract(string, string_path) %}
37 | 
38 |   {{string}}::json->>{{"'" ~ string_path ~ "'" }}
39 | 
40 | {% endmacro %}
41 | 


--------------------------------------------------------------------------------
/macros/public/normalizing/normalize_values.sql:
--------------------------------------------------------------------------------
 1 | {% macro is_dbt_relation(obj) %}
 2 |     {{ return (obj is mapping and obj.get('metadata', {}).get('type', '').endswith('Relation') )}}
 3 | {% endmacro %}
 4 | 
 5 | {% macro normalize_expression_cte(reference_table) %}
 6 |     with target_table as (
 7 |         {% if re_data.is_dbt_relation(reference_table) or reference_table is string %}
 8 |             select * from {{ reference_table }}
 9 |         {% elif reference_table is mapping %}
10 |             {% for key, value in reference_table.items() %}
11 |                 select '{{key}}' as source, '{{value}}' as target
12 |                 {% if not loop.last %}union all{% endif %}
13 |             {% endfor %}
14 |         {% endif %}
15 |     )
16 | {% endmacro %}
17 | 
18 | {%- macro normalize_values(source_relation, column_name, reference_table) -%}
19 |     ( 
20 |         {{ re_data.normalize_expression_cte(reference_table) }}
21 |         
22 |         select s.*, 
23 |         case when t.source is null
24 |                 then s.{{column_name}}
25 |             else t.target
26 |             end as {{ column_name + '__normalized'}} 
27 |         from {{ source_relation }} s
28 |         left join target_table t 
29 |         on t.source = s.{{column_name}}
30 |     )
31 | {%- endmacro -%}
32 | 
33 | 


--------------------------------------------------------------------------------
/macros/utils/monitored_config.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | {% macro columns_in_db(columns) %}
 4 |     {% set translated = [] %}
 5 |     {% if columns is none %}
 6 |         {{ return (none) }}
 7 |     {% endif %}
 8 |     {% for col in columns %}
 9 |         {% do translated.append(re_data.name_in_db(col))%}
10 |     {% endfor %}
11 |     {{ return (translated) }}
12 | {% endmacro %}
13 | 
14 | {% macro metrics_in_db(metrics) %}
15 |     {% set translated = metrics %}
16 |     {% set column_metrics = {} %}
17 |     {% for col in metrics.column %}
18 |         {% do column_metrics.update({re_data.name_in_db(col): metrics.column[col]}) %}
19 |     {% endfor %}
20 |     {% if column_metrics %}
21 |         {% do metrics.update({'column': column_metrics}) %}
22 |     {% endif %}
23 |     {{ return (metrics) }}
24 | {% endmacro %}
25 | 
26 | {% macro final_metrics(metrics_groups, additional_metrics) %}
27 |     {% set final_metrics_dict = dict([('group', {}), ('additional', {})]) %}
28 |     {% set all_metrics_groups = var('re_data:metrics_groups')%}
29 | 
30 |     {% for group in metrics_groups %}
31 |         {% set value = all_metrics_groups.get(group) %}
32 |         {% do final_metrics_dict['group'].update(value) %}
33 |     {% endfor %}
34 | 
35 |     {% do final_metrics_dict['additional'].update(additional_metrics) %}
36 |     {{ return (final_metrics_dict) }}
37 | 
38 | {% endmacro %}


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # What is re_data?
 2 | 
 3 | re_data is an open-source data reliability framework for the modern data stack. 😊
 4 | 
 5 | Currently, re_data focuses on observing the dbt project (together with underlying data warehouse - Postgres, BigQuery, Snowflake, Redshift).
 6 | 
 7 | Data transformations in re_data are implemented and exposed as models & macros in this dbt package.
 8 | 
 9 | # Live demo
10 | 
11 | Check out our **[live demo](https://docs.getre.io/ui-latest)** of what re_data can do for you 😊
12 | 
13 | # Getting started
14 | 
15 | [Check our docs!](https://docs.getre.io/) 🙂
16 | 
17 | [Join re_data community on Slack](https://join.slack.com/t/re-data/shared_invite/zt-vkauq1y8-tL4R4_H5nZoVvyXyy0hdug) (we are very responsive there)
18 | 
19 | [Check out more info, issues, etc. in master repo](https://github.com/re-data/re-data)
20 | 
21 | # Community
22 | 
23 | Say, hi to us on! 🙂
24 | 
25 | - [Slack](https://join.slack.com/t/re-data/shared_invite/zt-vkauq1y8-tL4R4_H5nZoVvyXyy0hdug)
26 | - [Twitter](https://twitter.com/re_data_labs)
27 | - [LinkedIn](https://www.linkedin.com/company/74608627/)
28 | 
29 | # Contributing
30 | 
31 | Any contributions are greatly appreciated! Most of our documentation and GitHub issues are managed in the primary [re-data](https://github.com/re-data/re-data) repo. See the Contributing section in [re-data](https://github.com/re-data/re-data) for details.
32 | 


--------------------------------------------------------------------------------
/macros/metrics/base/build_in/table_default.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {% macro re_data_metric_row_count(context) %}
 3 |     count(1)
 4 | {% endmacro %}
 5 | 
 6 | {% macro re_data_metric_freshness(context) %}
 7 |     {{ freshness_expression(context.time_filter) }}
 8 | {% endmacro %}
 9 | 
10 | {% macro freshness_expression(time_filter) %}
11 |     {# /* If time_filter is none, we are computing the metric globally. we set the value as null since a table without use of a time filter doesn't really have a freshness metric */ #}
12 |     {% if time_filter is none %}
13 |         cast(null as {{ numeric_type() }})
14 |     {% else %}
15 |         {{ adapter.dispatch('freshness_expression', 're_data')(time_filter) }}
16 |     {% endif %}
17 | {% endmacro %}
18 | 
19 | {% macro default__freshness_expression(time_filter) %}
20 |     EXTRACT(EPOCH FROM ({{time_window_end()}} - max({{time_filter}})))
21 | {% endmacro %}
22 | 
23 | {% macro bigquery__freshness_expression(time_filter) %}
24 |     TIMESTAMP_DIFF ( timestamp({{ time_window_end()}}), timestamp(max({{time_filter}})), SECOND)
25 | {% endmacro %}
26 | 
27 | {% macro snowflake__freshness_expression(time_filter) %}
28 |     timediff(second, max({{time_filter}}), {{- time_window_end() -}})
29 | {% endmacro %}
30 | 
31 | {% macro redshift__freshness_expression(time_filter) %}
32 |     DATEDIFF(second, max({{time_filter}}), {{- time_window_end() -}})
33 | {% endmacro %}
34 | 


--------------------------------------------------------------------------------
/integration_tests/models/transformed/schema.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | models:
 4 |   - name: buy_events
 5 |     tests:
 6 |       - re_data.assert_in_range:
 7 |           metric: row_count
 8 |           min_value: 0
 9 |           max_value: 10
10 | 
11 |       - re_data.assert_equal:
12 |           metric: row_count
13 |           value: 1
14 | 
15 |       - re_data.assert_false:
16 |           metric: freshness
17 |           expression: value is null
18 |       
19 |       - re_data.assert_greater_equal:
20 |           metric: my_distinct_table_rows
21 |           value: 10
22 | 
23 |     columns:
24 |       - name: value1
25 |         tests:
26 |           - re_data.assert_in_range:
27 |               metric: nulls_percent
28 |               min_value: 0
29 |               max_value: 10
30 | 
31 |           - re_data.assert_true:
32 |               metric: nulls_percent
33 |               expression: value = 0
34 |           
35 |           - re_data.assert_less:
36 |               metric: min
37 |               value: 100
38 |               condition: time_window_start = '2021-05-02'
39 |           
40 |           - re_data.assert_less_equal:
41 |               metric: min
42 |               value: 107
43 | 
44 |       - name: value2
45 |         tests:
46 |           - re_data.assert_greater_equal:
47 |               metric: min
48 |               value: 200
49 |               condition: time_window_start = '2021-05-02'
50 | 


--------------------------------------------------------------------------------
/macros/public/cleaning/clean_blacklist.sql:
--------------------------------------------------------------------------------
 1 | {% macro generate_blacklist_pattern(chars_to_blacklist) %}
 2 |     {% set pattern = [] %}
 3 |     {% for char in chars_to_blacklist %}
 4 |         {% set expr = '(' + char + ')' %}
 5 |         {% do pattern.append(expr) %}
 6 |     {% endfor %}
 7 | 
 8 |     {{ return(pattern | join('|')) }}
 9 | {% endmacro %}
10 | 
11 | {%- macro clean_blacklist(column_name, chars_to_blacklist, replacement) -%}
12 |     {% set pattern_string = re_data.generate_blacklist_pattern(chars_to_blacklist) %}
13 | 
14 |     {{ adapter.dispatch('clean_blacklist', 're_data')(column_name, pattern_string, replacement) }}
15 | {%- endmacro -%}
16 | 
17 | {%- macro default__clean_blacklist(column_name, pattern_string, replacement) -%}
18 |     regexp_replace( {{ column_name }}, '{{ pattern_string }}', '{{ replacement }}')
19 | {%- endmacro -%}
20 | 
21 | {%- macro postgres__clean_blacklist(column_name, pattern_string, replacement) -%}
22 |     regexp_replace( {{ column_name }}, '{{ pattern_string }}', '{{ replacement }}', 'g')
23 | {%- endmacro -%}
24 | 
25 | {%- macro redshift__clean_blacklist(column_name, pattern_string, replacement) -%}
26 |     regexp_replace( {{ column_name }}, '{{ pattern_string }}', '{{ replacement }}')
27 | {%- endmacro -%}
28 | 
29 | {%- macro bigquery__clean_blacklist(column_name, pattern_string, replacement) -%}
30 |     regexp_replace( {{ column_name }}, """{{ pattern_string }}""", '{{ replacement }}')
31 | {%- endmacro -%}


--------------------------------------------------------------------------------
/models/meta/re_data_columns.sql:
--------------------------------------------------------------------------------
 1 | {{
 2 |     config(
 3 |         materialized='table',
 4 |     )
 5 | }}
 6 | 
 7 | -- depends_on: {{ ref('re_data_run_started_at') }}
 8 | -- depends_on: {{ ref('re_data_monitored') }}
 9 | -- depends_on: {{ ref('re_data_selected') }}
10 | 
11 | {% if execute %}
12 |     {% set schemas = run_query(re_data.get_schemas()) %}
13 |     {% if schemas %}
14 | 
15 |     with columns_from_select as (
16 |         {% for row in schemas %}
17 |             {% set schema_name = re_data.name_in_db(re_data.row_value(row, 'schema')) %}
18 |             {{ get_monitored_columns(schema_name, re_data.row_value(row, 'database')) }}
19 |         {%- if not loop.last %} union all {%- endif %}
20 |         {% endfor %}
21 |     )
22 | 
23 |     select
24 |         cast (table_name as {{ string_type() }} ) as name,
25 |         cast (table_schema as {{ string_type() }} ) as schema,
26 |         cast (table_catalog as {{ string_type() }} ) as database,
27 |         cast (column_name as {{ string_type() }} ) as column_name,
28 |         cast (data_type as {{ string_type() }} ) as data_type,
29 |         cast (case is_nullable when 'YES' then 1 else 0 end as {{ boolean_type() }} ) as is_nullable,
30 |         cast ({{dbt.current_timestamp_backcompat()}} as {{ timestamp_type() }}) as computed_on
31 |     from columns_from_select
32 | 
33 |     {% else %}
34 |         {{ empty_columns_table() }}
35 |     {% endif %}
36 | 
37 | {% else %}
38 |     {{ empty_columns_table() }}
39 | {% endif %}
40 | 


--------------------------------------------------------------------------------
/macros/meta/table_name.sql:
--------------------------------------------------------------------------------
 1 | {% macro full_table_name(table_name, table_schema, table_catalog) %}
 2 |     {{ adapter.dispatch('full_table_name', 're_data')(table_name, table_schema, table_catalog) }}
 3 | {% endmacro %}
 4 | 
 5 | 
 6 | {% macro default__full_table_name(table_name, table_schema, table_catalog) %}
 7 |     '"' || {{table_catalog}} || '"' || '.' || '"' || {{table_schema}} || '"' || '.' || '"' || {{table_name}} || '"'
 8 | {% endmacro %}
 9 | 
10 | 
11 | {% macro bigquery__full_table_name(table_name, table_schema, table_catalog) %}
12 |     '`' || {{table_catalog}} || '`' || '.' || '`' || {{table_schema}} || '`' || '.' || '`' || {{table_name}} || '`'
13 | {% endmacro %}
14 | 
15 | 
16 | {% macro full_table_name_values(table_name, table_schema, table_catalog) %}
17 |     {% set result = adapter.dispatch('full_table_name_values', 're_data')(table_name, table_schema, table_catalog) %}
18 |     {{ return (result.strip()) }}
19 | {% endmacro %}
20 | 
21 | {% macro default__full_table_name_values(table_name, table_schema, table_catalog) %}
22 |     "{{table_catalog}}"."{{table_schema}}"."{{table_name}}"
23 | {% endmacro %}
24 | 
25 | 
26 | {% macro bigquery__full_table_name_values(table_name, table_schema, table_catalog) %}
27 |     `{{table_catalog}}`.`{{table_schema}}`.`{{table_name}}`
28 | {% endmacro %}
29 | 
30 | 
31 | {% macro snowflake__full_table_name_values(table_name, table_schema, table_catalog) %}
32 |     "{{table_catalog|upper}}"."{{table_schema|upper}}"."{{table_name|upper}}"
33 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/seeds/public_macros/normalizing/expected_us_states_normalized.csv:
--------------------------------------------------------------------------------
 1 | "state","code","state__normalized"
 2 | "Ariz.","AZ","Arizona"
 3 | "Ind.","IN","Indiana"
 4 | "La.","LA","Louisiana"
 5 | "W.Va.","WV","West Virginia"
 6 | "Nebr.","NE","Nebraska"
 7 | "Pa.","PA","Pennsylvania"
 8 | "Iowa","IA","Iowa"
 9 | "N.H.","NH","New Hampshire"
10 | "S.C.","SC","South Carolina"
11 | "Ore.","OR","Oregon"
12 | "Conn.","CT","Connecticut"
13 | "R.I.","RI","Rhode Island"
14 | "Minn.","MN","Minnesota"
15 | "D.C.","DC","District of Columbia"
16 | "Wyo.","WY","Wyoming"
17 | "Hawaii","HI","Hawaii"
18 | "Wash.","WA","Washington"
19 | "N.D.","ND","North Dakota"
20 | "Mass.","MA","Massachusetts"
21 | "N.Y.","NY","New York"
22 | "N.M.","NM","New Mexico"
23 | "Colo.","CO","Colorado"
24 | "Ohio","OH","Ohio"
25 | "Idaho","ID","Idaho"
26 | "Ala.","AL","Alabama"
27 | "Ark.","AR","Arkansas"
28 | "S.D.","SD","South Dakota"
29 | "Mo.","MO","Missouri"
30 | "N.J.","NJ","New Jersey"
31 | "Miss.","MS","Mississippi"
32 | "Kans.","KS","Kansas"
33 | "Vt.","VT","Vermont"
34 | "Calif.","CA","California"
35 | "Mich.","MI","Michigan"
36 | "Alaska","AK","Alaska"
37 | "Nev.","NV","Nevada"
38 | "Okla.","OK","Oklahoma"
39 | "Tenn.","TN","Tennessee"
40 | "Ga.","GA","Georgia"
41 | "Wis.","WI","Wisconsin"
42 | "Ky.","KY","Kentucky"
43 | "N.C.","NC","North Carolina"
44 | "Mont.","MT","Montana"
45 | "Fla.","FL","Florida"
46 | "Va.","VA","Virginia"
47 | "Tex.","TX","Texas"
48 | "Md.","MD","Maryland"
49 | "Utah","UT","Utah"
50 | "Maine","ME","Maine"
51 | "Del.","DE","Delaware"
52 | "Ill.","IL","Illinois"
53 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 redata-team
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 
23 | 
24 | ========================================================================
25 | Third party Apache 2.0 licenses
26 | ========================================================================
27 | 
28 | The following files contain signifant parts of code licensed under third party Apache 2.0 License.
29 | 
30 | macros/public/validating/regex_dict.sql 
31 | macros/utils/fivetran_utils/percentile.sql
32 | macros/utils/fivetran_utils/json_extract.sql
33 | 
34 | 
35 | 


--------------------------------------------------------------------------------
/.github/workflows/generate-docs.yml:
--------------------------------------------------------------------------------
 1 | name: Generate Documentation
 2 | 
 3 | on: [push]
 4 | 
 5 | env:
 6 |   DBT_PROFILES_DIR: ./
 7 |   DBT_VERSION: 1.7
 8 |   PYTHON_VERSION: "3.8.x"
 9 | 
10 | jobs:
11 |   generate-docs:
12 |     runs-on: ubuntu-latest
13 |     if: github.event_name == 'push'
14 |     services:
15 |       postgres:
16 |         image: postgres
17 |         env:
18 |           POSTGRES_PASSWORD: postgres
19 |         # Set health checks to wait until postgres has started
20 |         options: >-
21 |           --health-cmd pg_isready
22 |           --health-interval 10s
23 |           --health-timeout 5s
24 |           --health-retries 5
25 |         ports:
26 |           # Maps tcp port 5432 on service container to the host
27 |           - 5432:5432
28 |     steps:        
29 |       - name: Check out
30 |         uses: actions/checkout@v2
31 | 
32 |       - uses: actions/setup-python@v4
33 |         with:
34 |           python-version: ${{ env.PYTHON_VERSION }}
35 | 
36 |       - name: Install dependencies and run
37 |         run: |
38 |           pip install protobuf==4.25.3 dbt-postgres==$DBT_VERSION
39 |           dbt deps
40 |           dbt run
41 | 
42 |       - name: Generate Documentation
43 |         run: dbt docs generate
44 | 
45 |       - name: Copy files
46 |         if: github.ref == 'refs/heads/main'
47 |         run: 'mkdir docs && cp target/{catalog.json,index.html,manifest.json,run_results.json} docs/'
48 |         shell: bash
49 | 
50 |       - name: Deploy
51 |         uses: peaceiris/actions-gh-pages@v3
52 |         if: github.ref == 'refs/heads/main' && !env.ACT
53 |         with:
54 |           github_token: ${{ secrets.GITHUB_TOKEN }}
55 |           publish_dir: ./docs
56 | 


--------------------------------------------------------------------------------
/integration_tests/macros/drop_all_schemas.sql:
--------------------------------------------------------------------------------
 1 | {% macro get_schemas_used(schema_name) %}
 2 |     {% set schemas = [
 3 |         schema_name,
 4 |         schema_name + '_re',
 5 |         schema_name + '_re_internal',
 6 |         schema_name + '_raw',
 7 |         schema_name + '_expected',
 8 |         schema_name + '_dbt_test__audit',
 9 |         schema_name + '_seeds'
10 |     ] %}
11 |     {{ return (schemas) }}
12 | {% endmacro %}
13 | 
14 | {% macro drop_all_schemas(schema_name) %}
15 |     {% set schemas_to_drop = get_schemas_used(schema_name) %}
16 |     {{ adapter.dispatch('drop_all_schemas')(schemas_to_drop) }}
17 | {% endmacro %}
18 | 
19 | {% macro default__drop_all_schemas(schemas_to_drop) %}
20 |     {% for schema in schemas_to_drop %}
21 |         {% set relation = api.Relation.create(database=target.database, schema=schema) %}
22 |         {% do adapter.drop_schema(relation) %}
23 |     {% endfor %}
24 | {% endmacro %}
25 | 
26 | {% macro redshift__drop_all_schemas(schemas_to_drop) %}
27 |     {# 
28 |         dropping schemas with adapter.drop_schema doesn't seem to work with redshift
29 |         so we default to issuing DDL commands to redshift
30 |     #}
31 |     {% set drop_query %}
32 |         {% for schema in schemas_to_drop %}
33 |             drop schema if exists {{schema}} cascade;
34 |         {% endfor %}
35 |     {% endset %}
36 |     {% do run_query(drop_query) %}
37 | {% endmacro %}
38 | 
39 | {% macro create_required_schemas(schema_name) %}
40 |     {# required to manually create schemas used for redshift tests #}
41 |     {% set schemas_to_drop = get_schemas_used(schema_name) %}
42 |     {% set create_query %}
43 |         {% for schema in schemas_to_drop %}
44 |             create schema if not exists {{schema}};
45 |         {% endfor %}
46 |     {% endset %}
47 |     {% do run_query(create_query) %}
48 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/utils/fivetran_utils/percentile.sql:
--------------------------------------------------------------------------------
 1 | {#
 2 | # This file contains significant part of code derived from 
 3 | # https://github.com/fivetran/dbt_fivetran_utils/tree/v0.4.0 which is licensed under Apache License 2.0.
 4 | #}
 5 | 
 6 | {% macro percentile(percentile_field, partition_field, percent) -%}
 7 | 
 8 | {{ adapter.dispatch('percentile','re_data') (percentile_field, partition_field, percent) }}
 9 | 
10 | {%- endmacro %}
11 | 
12 | --percentile calculation specific to Redshift
13 | {% macro default__percentile(percentile_field, partition_field, percent)  %}
14 | 
15 |     percentile_cont( 
16 |         {{ percent }} )
17 |         within group ( order by {{ percentile_field }} )
18 |         over ( partition by {{ partition_field }} )
19 | 
20 | {% endmacro %}
21 | 
22 | --percentile calculation specific to Redshift
23 | {% macro redshift__percentile(percentile_field, partition_field, percent)  %}
24 | 
25 |     percentile_cont( 
26 |         {{ percent }} )
27 |         within group ( order by {{ percentile_field }} )
28 |         over ( partition by {{ partition_field }} )
29 | 
30 | {% endmacro %}
31 | 
32 | --percentile calculation specific to BigQuery
33 | {% macro bigquery__percentile(percentile_field, partition_field, percent)  %}
34 | 
35 |     percentile_cont( 
36 |         {{ percentile_field }}, 
37 |         {{ percent }}) 
38 |         over (partition by {{ partition_field }}    
39 |         )
40 | 
41 | {% endmacro %}
42 | 
43 | {% macro postgres__percentile(percentile_field, partition_field, percent)  %}
44 | 
45 |     percentile_cont( 
46 |         {{ percent }} )
47 |         within group ( order by {{ percentile_field }} )
48 |     /* have to group by partition field */
49 | 
50 | {% endmacro %}
51 | 
52 | {% macro spark__percentile(percentile_field, partition_field, percent)  %}
53 | 
54 |     percentile( 
55 |         {{ percentile_field }}, 
56 |         {{ percent }}) 
57 |         over (partition by {{ partition_field }}    
58 |         )
59 | 
60 | {% endmacro %}
61 | 


--------------------------------------------------------------------------------
/dbt_project.yml:
--------------------------------------------------------------------------------
 1 | name: "re_data"
 2 | version: "0.12.0"
 3 | config-version: 2
 4 | 
 5 | require-dbt-version: [">=1.0.0", "<2.0.0"]
 6 | 
 7 | profile: "re_data_postgres"
 8 | 
 9 | target-path: "target" # directory which will store compiled SQL files
10 | clean-targets: ["target", "dbt_modules", "dbt_packages"]
11 | 
12 | on-run-end:
13 |   - "{% if var('re_data:save_test_history') %} {{ re_data.save_test_history(results) }} {% endif %}"
14 | 
15 | vars:
16 |   re_data:max_columns_in_query: 10
17 |   re_data:time_window_end: '{{ run_started_at.strftime("%Y-%m-%d 00:00:00") }}'
18 |   re_data:time_window_start: '{{ (run_started_at - modules.datetime.timedelta(1)).strftime("%Y-%m-%d 00:00:00") }}'
19 |   re_data:anomaly_detection_look_back_days: 30
20 |   re_data:select: null
21 |   re_data:re_data_anomalies_filtered: re_data_anomalies
22 | 
23 |   re_data:alerting_z_score: 3
24 | 
25 |   re_data:save_test_history: false
26 |   re_data:show_warns_as_alerts: false
27 | 
28 |   re_data:anomaly_detector:
29 |     name: modified_z_score
30 |     threshold: 3
31 | 
32 |   re_data:store_table_samples: false
33 | 
34 |   re_data:metrics_groups:
35 |     table_metrics:
36 |       table:
37 |         - row_count
38 |         - freshness
39 | 
40 |     column_metrics:
41 |       column:
42 |         numeric:
43 |           - min
44 |           - max
45 |           - avg
46 |           - stddev
47 |           - variance
48 |           - nulls_count
49 |           - nulls_percent
50 |         text:
51 |           - min_length
52 |           - max_length
53 |           - avg_length
54 |           - nulls_count
55 |           - missing_count
56 |           - nulls_percent
57 |           - missing_percent
58 |         boolean:
59 |           - count_true
60 |           - count_false
61 |           - nulls_count
62 |           - nulls_percent
63 | 
64 |   re_data:default_metrics:
65 |     - table_metrics
66 |     - column_metrics
67 | 
68 | models:
69 |   re_data:
70 |     +schema: re
71 |     internal:
72 |       +schema: re_internal
73 | 


--------------------------------------------------------------------------------
/macros/utils/mock/empty_tables.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {% macro all_types_select() %}
 3 |     with types_table as (
 4 |         select
 5 |             cast (null as {{ string_type() }}) as string_type,
 6 |             cast (null as {{ long_string_type() }}) as long_string_type,
 7 |             cast (1 as {{ numeric_type() }}) as numeric_type,
 8 |             cast ('2000-01-10' as {{ timestamp_type() }}) as timestamp_type,
 9 |             cast (true as {{ boolean_type() }}) as boolean_type
10 |     )
11 | {% endmacro %}
12 | 
13 | {% macro empty_table_generic(list) %}
14 |     {{ re_data.all_types_select() }}
15 |     select
16 |     {% for name, type in list %}
17 |          {{ type }}_type as {{ name }}
18 |         {%- if not loop.last %}, {%- endif %}
19 |     {% endfor %}
20 |     from types_table
21 |     where string_type is not null
22 | {% endmacro %}
23 | 
24 | {% macro empty_last_base_metrics() %}
25 |     {{
26 |         re_data.empty_table_generic([
27 |             ('table_name', 'string'),
28 |             ('column_name', 'string'),
29 |             ('metric', 'string'),
30 |             ('value', 'numeric')
31 |         ])
32 |     }}
33 | {% endmacro %}
34 | 
35 | {% macro empty_last_table_samples() %}
36 |     {{
37 |         re_data.empty_table_generic([
38 |             ('table_name', 'string'),
39 |             ('sample_data', 'string')
40 |         ])
41 |     }}
42 | {% endmacro %}
43 | 
44 | {% macro empty_columns_table() %}
45 |     {{
46 |         re_data.empty_table_generic([
47 |             ('name', 'string'),
48 |             ('schema', 'string'),
49 |             ('database', 'string'),
50 |             ('column_name', 'string'),
51 |             ('data_type', 'string'),
52 |             ('is_nullable', 'boolean'),
53 |             ('time_filter', 'string'),
54 |             ('computed_on', 'timestamp')
55 |         ])
56 |     }}
57 | {% endmacro %}
58 | 
59 | 
60 | {% macro empty_table() %}
61 |     {{
62 |         re_data.empty_table_generic([
63 |             ('name', 'string')
64 |         ])
65 |     }}
66 | {% endmacro %}
67 | 
68 | 


--------------------------------------------------------------------------------
/integration_tests/seeds/monitoring/expected_test_history.csv:
--------------------------------------------------------------------------------
 1 | table_name,column_name,test_name,status,message,failures_count,severity
 2 | TEST_RE_DATA_METRICS,---,pected_metrics_,Pass,---,0,ERROR
 3 | TEST_RE_DATA_TABLE_SAMPLES,---,_table_samples_,Pass,---,0,ERROR
 4 | TEST_RE_DATA_Z_SCORE,---,pected_z_score_,Pass,---,0,ERROR
 5 | TEST_RE_DATA_ANOMALIES,---,cted_anomalies_,Pass,---,0,ERROR
 6 | BUY_EVENTS,---,_table_rows__10,Pass,---,0,ERROR
 7 | BUY_EVENTS,---,ts_row_count__1,Pass,---,0,ERROR
 8 | BUY_EVENTS,---,null__freshness,Pass,---,0,ERROR
 9 | BUY_EVENTS,VALUE2,5_02___min__200,Pass,---,0,ERROR
10 | BUY_EVENTS,VALUE1,alue1__min__107,Pass,---,0,ERROR
11 | BUY_EVENTS,VALUE1,ulls_percent__0,Pass,---,0,ERROR
12 | BUY_EVENTS,VALUE1,5_02___min__100,Pass,---,0,ERROR
13 | BUY_EVENTS,---,0__row_count__0,Pass,---,0,ERROR
14 | RE_DATA_METRICS,---,_buy_events___3,Pass,---,0,ERROR
15 | RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
16 | RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
17 | BUY_EVENTS,VALUE1,__nulls_percent,Pass,---,0,ERROR
18 | RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
19 | TEST_RE_DATA_TABLE_SAMPLES,---,_table_samples_,Pass,---,0,ERROR
20 | TEST_RE_DATA_ANOMALIES,---,cted_anomalies_,Pass,---,0,ERROR
21 | BUY_EVENTS,---,ts_row_count__1,Pass,---,0,ERROR
22 | BUY_EVENTS,---,null__freshness,Pass,---,0,ERROR
23 | BUY_EVENTS,VALUE2,5_02___min__200,Pass,---,0,ERROR
24 | BUY_EVENTS,---,_table_rows__10,Pass,---,0,ERROR
25 | BUY_EVENTS,VALUE1,ulls_percent__0,Pass,---,0,ERROR
26 | BUY_EVENTS,---,0__row_count__0,Pass,---,0,ERROR
27 | BUY_EVENTS,VALUE1,5_02___min__100,Pass,---,0,ERROR
28 | BUY_EVENTS,VALUE1,alue1__min__107,Pass,---,0,ERROR
29 | BUY_EVENTS,VALUE1,__nulls_percent,Pass,---,0,ERROR
30 | RE_DATA_METRICS,---,_buy_events___3,Pass,---,0,ERROR
31 | RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
32 | RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
33 | RE_DATA_METRICS,---,ref_buy_events_,Pass,---,0,ERROR
34 | TEST_RE_DATA_METRICS,---,pected_metrics_,Pass,---,0,ERROR
35 | TEST_RE_DATA_Z_SCORE,---,pected_z_score_,Pass,---,0,ERROR
36 | 


--------------------------------------------------------------------------------
/macros/store/insert_list_to_table.sql:
--------------------------------------------------------------------------------
 1 | {% macro insert_list_to_table(table, list, params, dtype=None,insert_size=100) %}
 2 | 
 3 |     {% set single_insert_list = [] %}
 4 |     {% for el in list %}
 5 |         {% do single_insert_list.append(el) %}
 6 |         {% set single_insert_list_size = single_insert_list | length %}
 7 |         {% if single_insert_list_size == insert_size or loop.last %}
 8 | 
 9 |             {% set insert_query %}
10 |                 insert into {{ table }} ({%- for p in params %}{{p}}{% if not loop.last %}, {% endif %}{% endfor %}) values
11 |                 {%- for row in single_insert_list -%}
12 |                     (
13 |                     {%- for p in params -%}
14 |                         {%- if row[p] is none -%}
15 |                             NULL
16 |                         {%- else -%}
17 |                             {%- if row[p] is string -%}
18 |                                 {%- if dtype and p in dtype -%}
19 |                                   {% set cast_type = dtype[p] %}
20 |                                   cast ({{ re_data.quote_string(row[p]) }} as {{ cast_type }})
21 |                                 {%- else %}
22 |                                   {{- re_data.quote_string(row[p]) -}}
23 |                                 {%- endif -%}
24 |                             {%- elif row[p] is number -%}
25 |                                 {{-row[p]-}}
26 |                             {%- else -%}
27 |                                 {{- re_data.quote_string(tojson(row[p])) -}}
28 |                             {%- endif -%}
29 |                         {%- endif -%}
30 |                         {%- if not loop.last -%},{%- endif -%}
31 |                     {%- endfor -%}
32 |                     )
33 |                     {%- if not loop.last -%},{%- endif %}
34 |                 {% endfor -%}
35 |             {% endset %}
36 | 
37 |             {% do run_query(insert_query) %}
38 |             {% do single_insert_list.clear() %}
39 |         {% endif %}
40 |     {% endfor %}
41 | 
42 | {% endmacro %}
43 | 


--------------------------------------------------------------------------------
/macros/config/get_model_config.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {% macro get_model_config(monitored) %}
 3 |     {% set model = {} %}
 4 |     {% do model.update({'name': re_data.row_value(monitored, 'name')}) %}
 5 |     {% do model.update({'schema': re_data.row_value(monitored, 'schema')}) %}
 6 |     {% do model.update({'database': re_data.row_value(monitored, 'database')}) %}
 7 |     {% do model.update({'time_filter': re_data.row_value(monitored, 'time_filter')}) %}    
 8 |     {% do model.update({'metrics': fromjson(re_data.row_value(monitored, 'metrics'))}) %}
 9 |     {% do model.update({'model_name': model.get('database') + '.' + model.get('schema') + '.' + model.get('name')}) %}
10 |     {% do model.update({'table_name': full_table_name_values(model.get('name'), model.get('schema'), model.get('database'))}) %}
11 | 
12 |     {% set columns_db = re_data.row_value(monitored, 'columns') %}
13 | 
14 |     {% set column_list = fromjson(columns_db) if columns_db is not none else none %}
15 |     {% set columns_dict = re_data.dict_from_list(column_list) %}
16 | 
17 |     {% do model.update({'columns_dict': columns_dict}) %}
18 |     {% do model.update({'columns_compute_all': columns_dict is none}) %}
19 | 
20 |     {% set columns_query %}
21 |         select * from {{ ref('re_data_columns') }}
22 |         where name = '{{ model.name }}' and schema = '{{ model.schema }}' and database = '{{ model.database }}'
23 |     {% endset %}
24 |     {% set columns = run_query(columns_query) %}
25 | 
26 |     {% set columns_info = {} %}
27 |     {% for col in columns %}
28 |         {% set column_name = re_data.row_value(col, 'column_name') %}
29 |         {% set data_type = re_data.get_column_type(col) %}
30 |         {% do columns_info.update({column_name: { 'data_type': data_type }}) %}
31 |     {% endfor %}
32 | 
33 |     {% do model.update({'columns_info': columns_info}) %}
34 |     {% do model.update({'columns': columns}) %}
35 | 
36 |     {{ return(model) }}
37 | {% endmacro %}
38 | 
39 | {% macro should_compute_metric(model, column_name) %}
40 |     {{ return(model.columns_compute_all or model.columns_dict.get(column_name)) }}
41 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/samples/internal_model_template.sql:
--------------------------------------------------------------------------------
 1 | {%- macro order_by_if_time_filter(time_filter) -%}
 2 |     {%- if time_filter is not none -%}
 3 |         order by {{ time_filter }} desc
 4 |     {%- endif -%}
 5 | {%- endmacro -%}
 6 | 
 7 | 
 8 | {% macro re_data_last_table_samples() %}
 9 |     {{ re_data.generate_depends(['re_data_selected', 're_data_monitored', 're_data_columns', 're_data_run_started_at', 're_data_last_table_samples_part']) }}
10 | 
11 |     {{
12 |         config(
13 |             materialized='table',
14 |         )
15 |     }}
16 | 
17 |     {% if var.has_var('re_data:store_table_samples') %}
18 |         {% set store_samples = var('re_data:store_table_samples') %}
19 |     {% endif %}
20 |     {% if not re_data.in_compile() and store_samples is sameas true %}
21 |         {%- set tables = run_query(re_data.get_tables()) %}
22 | 
23 |         {% set samples_list = [] %}
24 |         {%- for sample_table in tables %}
25 | 
26 |             {% set model = get_model_config(sample_table) %}
27 |             {% set columns_to_sample = [] %}
28 |             {% for key, value in model.columns_info.items() | sort %}
29 |                 {% if value.data_type in ['numeric', 'text'] %}
30 |                     {% do columns_to_sample.append(key) %}
31 |                 {% endif %}
32 |             {% endfor %}
33 | 
34 |             {% set samples_query %}
35 |                 select {{ print_list(columns_to_sample)}} from {{ model.table_name }}
36 |                 {{ order_by_if_time_filter(model.time_filter) }}
37 |                 limit 10
38 |             {% endset %}
39 | 
40 |             {% set samples = re_data.agate_to_list(run_query(samples_query)) %}
41 |             {% do samples_list.append({
42 |                 'table_name': model.model_name,
43 |                 'sample_data': samples,
44 |             }) %}
45 | 
46 |         {% endfor %}
47 |         {% do re_data.insert_list_to_table(
48 |                 ref('re_data_last_table_samples_part'),
49 |                 samples_list,
50 |                 ['table_name', 'sample_data']
51 |             ) %}
52 |     {% endif %}
53 | 
54 |     {{ re_data.empty_last_table_samples() }}
55 | 
56 | {% endmacro %}


--------------------------------------------------------------------------------
/profiles.yml:
--------------------------------------------------------------------------------
 1 | re_data_postgres:
 2 |   target: dev
 3 |   outputs:
 4 |     dev:
 5 |       type: postgres
 6 |       host: localhost
 7 |       user: postgres
 8 |       password: postgres
 9 |       port: 5432
10 |       dbname: postgres
11 |       schema: dq
12 |       threads: 4
13 | re_data_snowflake:
14 |   target: dev
15 |   outputs:
16 |     dev:
17 |       type: snowflake
18 |       account: "{{ env_var('SNOWFLAKE_RE_DATA_TESTING_ACCOUNT') }}"
19 |       user: "{{ env_var('RE_DATA_TESTING_USER') }}"
20 |       password: "{{ env_var('RE_DATA_TESTING_PASSWORD') }}"
21 |       database: RE_DATA_TESTING
22 |       warehouse: RE_DATA_TESTING_DWH
23 |       schema: "{{ env_var('DQ_SCHEMA') }}"
24 |       threads: 4
25 | re_data_redshift:
26 |   target: dev
27 |   outputs:
28 |     dev:
29 |       type: redshift
30 |       host: "{{ env_var('REDSHIFT_RE_DATA_TESTING_HOST') }}"
31 |       user: "{{ env_var('RE_DATA_TESTING_USER') }}"
32 |       password: "{{ env_var('RE_DATA_TESTING_PASSWORD') }}"
33 |       port: 5439
34 |       dbname: re_data_testing
35 |       schema: "{{ env_var('DQ_SCHEMA') }}"
36 |       threads: 4
37 | re_data_bigquery:
38 |   target: dev
39 |   outputs:
40 |     dev:
41 |       type: bigquery
42 |       method: service-account-json
43 |       project: "{{ env_var('BIGQUERY_TESTING_PROJECT_ID') }}"
44 |       dataset: "{{ env_var('DQ_SCHEMA') }}"
45 |       threads: 4
46 |       keyfile_json:
47 |         type: "{{ env_var('BIGQUERY_TESTING_TYPE') }}"
48 |         project_id: "{{ env_var('BIGQUERY_TESTING_PROJECT_ID') }}"
49 |         private_key_id: "{{ env_var('BIGQUERY_TESTING_PRIVATE_KEY_ID') }}"
50 |         private_key: "{{ env_var('BIGQUERY_TESTING_PRIVATE_KEY') }}"
51 |         client_email: "{{ env_var('BIGQUERY_TESTING_CLIENT_EMAIL') }}"
52 |         client_id: "{{ env_var('BIGQUERY_TESTING_CLIENT_ID') }}"
53 |         auth_uri: "{{ env_var('BIGQUERY_TESTING_AUTH_URI') }}"
54 |         token_uri: "{{ env_var('BIGQUERY_TESTING_TOKEN_URI') }}"
55 |         auth_provider_x509_cert_url: "{{ env_var('BIGQUERY_TESTING_AUTH_PROVIDER_X509_CERT_URL') }}"
56 |         client_x509_cert_url: "{{ env_var('BIGQUERY_TESTING_CLIENT_X509_CERT_URL') }}"
57 |       location: US
58 |       timeout_seconds: 300
59 |       priority: interactive
60 |       retries: 1


--------------------------------------------------------------------------------
/models/metrics/types/base/re_data_base_metrics.sql:
--------------------------------------------------------------------------------
 1 | {{
 2 |     config(
 3 |         materialized='incremental',
 4 |         unique_key = 'id',
 5 |         on_schema_change='sync_all_columns',
 6 |     )
 7 | }}
 8 | 
 9 | -- depends_on: {{ ref('re_data_columns') }}
10 | -- depends_on: {{ ref('re_data_last_base_metrics_thread0') }}
11 | -- depends_on: {{ ref('re_data_last_base_metrics_thread1') }}
12 | -- depends_on: {{ ref('re_data_last_base_metrics_thread2') }}
13 | -- depends_on: {{ ref('re_data_last_base_metrics_thread3') }}
14 | -- depends_on: {{ ref('re_data_last_base_metrics_part0') }}
15 | -- depends_on: {{ ref('re_data_last_base_metrics_part1') }}
16 | -- depends_on: {{ ref('re_data_last_base_metrics_part2') }}
17 | -- depends_on: {{ ref('re_data_last_base_metrics_part3') }}
18 | -- depends_on: {{ ref('re_data_run_started_at') }}
19 | -- depends_on: {{ ref('re_data_monitored') }}
20 | -- depends_on: {{ ref('re_data_selected') }}
21 | 
22 | with 
23 | 
24 | with_time_window as (
25 |     {% set parts = ['0','1','2','3'] %}
26 |     {% for part in parts %}
27 |         {% set ref_name = 're_data_last_base_metrics_part' + part %}
28 |         select
29 |             *,
30 |             {{ time_window_start() }} as time_window_start,
31 |             {{ time_window_end() }} as time_window_end
32 |         from {{ ref(ref_name) }}
33 |         {%- if not loop.last %} union all {%- endif %}
34 |     {% endfor %}
35 | )
36 | select
37 |     cast ({{ dbt_utils.generate_surrogate_key([
38 |         'table_name',
39 |         'column_name',
40 |         'metric',
41 |         'time_window_start',
42 |         'time_window_end'
43 |     ]) }} as {{ string_type() }} ) as id,
44 |     cast (table_name as {{ string_type() }} ) as table_name,
45 |     cast (column_name as {{ string_type() }} ) as column_name,
46 |     cast (metric as {{ string_type() }} ) as metric,
47 |     cast (value as {{ numeric_type() }} ) as value,
48 |     cast (time_window_start as {{ timestamp_type() }} ) as time_window_start,
49 |     cast (time_window_end as {{ timestamp_type() }} ) as time_window_end,
50 |     cast (
51 |         {{ interval_length_sec('time_window_start', 'time_window_end') }} as {{ integer_type() }}
52 |     ) as interval_length_sec,
53 |     cast ({{dbt.current_timestamp_backcompat()}} as {{ timestamp_type() }}) as computed_on
54 | from with_time_window
55 | 


--------------------------------------------------------------------------------
/integration_tests/models/public_macros/normalizing/us_states_normalized.sql:
--------------------------------------------------------------------------------
 1 | with us_states_normalization_cte as (
 2 |     select source, target from {{ ref('us_states_normalization') }}
 3 | )
 4 | 
 5 | {% set us_states_mapping = {'Ala.': 'Alabama', 'Alaska': 'Alaska', 'Ariz.': 'Arizona', 'Ark.': 'Arkansas', 'Calif.': 'California', 'Colo.': 'Colorado', 'Conn.': 'Connecticut',
 6 |     'Del.': 'Delaware', 'D.C.': 'District of Columbia', 'Fla.': 'Florida', 'Ga.': 'Georgia', 'Hawaii': 'Hawaii', 'Idaho': 'Idaho', 'Ill.': 'Illinois', 'Ind.': 'Indiana',
 7 |     'Iowa': 'Iowa', 'Kans.': 'Kansas', 'Ky.': 'Kentucky', 'La.': 'Louisiana', 'Maine': 'Maine', 'Md.': 'Maryland', 'Mass.': 'Massachusetts', 'Mich.': 'Michigan',
 8 |     'Minn.': 'Minnesota', 'Miss.': 'Mississippi', 'Mo.': 'Missouri', 'Mont.': 'Montana', 'Nebr.': 'Nebraska', 'Nev.': 'Nevada', 'N.H.': 'New Hampshire', 'N.J.': 'New Jersey',
 9 |     'N.M.': 'New Mexico', 'N.Y.': 'New York', 'N.C.': 'North Carolina', 'N.D.': 'North Dakota', 'Ohio': 'Ohio', 'Okla.': 'Oklahoma', 'Ore.': 'Oregon', 'Pa.': 'Pennsylvania',
10 |     'R.I.': 'Rhode Island', 'S.C.': 'South Carolina', 'S.D.': 'South Dakota', 'Tenn.': 'Tennessee', 'Tex.': 'Texas', 'Utah': 'Utah', 'Vt.': 'Vermont', 'Va.': 'Virginia',
11 |     'Wash.': 'Washington', 'W.Va.': 'West Virginia', 'Wis.': 'Wisconsin', 'Wyo.': 'Wyoming'}
12 | %}
13 | 
14 | 
15 | {# 
16 |     We have three ways of passing the source used for normalization
17 |         1. passing a dbt model using ref('') which is of type Relation.
18 |         2. passing a common table expression that contains the source mapping
19 |             Note: model or cte must include "source" and "target" column names used for normalization in 1. & 2. repectively
20 |         3. passing a dictionary of values that map from source -> target ie {[source]: [target]}
21 |  #}
22 | 
23 | select distinct * from (
24 |     select state, code, state__normalized from {{ re_data.normalize_values(ref('abbreviated_us_states'), 'state', ref('us_states_normalization')) }} s
25 |     union all
26 |     select state, code, state__normalized from {{ re_data.normalize_values(ref('abbreviated_us_states'), 'state', 'us_states_normalization_cte') }} s
27 |     union all
28 |     select state, code, state__normalized from {{ re_data.normalize_values(ref('abbreviated_us_states'), 'state', us_states_mapping) }} s
29 | ) as normalized


--------------------------------------------------------------------------------
/macros/utils/used_types.sql:
--------------------------------------------------------------------------------
  1 | {% macro timestamp_type() %}
  2 |     {{ adapter.dispatch('timestamp_type', 're_data')() }}
  3 | {% endmacro %}
  4 | 
  5 | {% macro default__timestamp_type() %}
  6 |     timestamp without time zone
  7 | {% endmacro %}
  8 | 
  9 | {% macro redshift__timestamp_type() %}
 10 |     TIMESTAMP
 11 | {% endmacro %}
 12 | 
 13 | {% macro bigquery__timestamp_type() %}
 14 |     TIMESTAMP
 15 | {% endmacro %}
 16 | 
 17 | {% macro snowflake__timestamp_type() %}
 18 |     TIMESTAMP_NTZ
 19 | {% endmacro %}
 20 | 
 21 | {% macro string_type() %}
 22 |     {{ adapter.dispatch('string_type', 're_data')() }}
 23 | {% endmacro %}
 24 | 
 25 | {% macro default__string_type() %}
 26 |     text
 27 | {% endmacro %}
 28 | 
 29 | {% macro redshift__string_type() %}
 30 |     varchar(2047)
 31 | {% endmacro %}
 32 | 
 33 | {% macro bigquery__string_type() %}
 34 |     STRING
 35 | {% endmacro %}
 36 | 
 37 | {% macro snowflake__string_type() %}
 38 |     STRING
 39 | {% endmacro %}
 40 | 
 41 | {% macro long_string_type() %}
 42 |     {{ adapter.dispatch('long_string_type', 're_data')() }}
 43 | {% endmacro %}
 44 | 
 45 | {% macro default__long_string_type() %}
 46 |     {{ re_data.string_type() }}
 47 | {% endmacro %}
 48 | 
 49 | {% macro redshift__long_string_type() %}
 50 |     varchar(65535)
 51 | {% endmacro %}
 52 | 
 53 | {% macro integer_type() %}
 54 |     INTEGER
 55 | {% endmacro %}
 56 | 
 57 | 
 58 | {% macro boolean_type() %}
 59 |     {{ adapter.dispatch('boolean_type', 're_data')() }}
 60 | {% endmacro %}
 61 | 
 62 | {% macro default__boolean_type() %}
 63 |     BOOLEAN
 64 | {% endmacro %}
 65 | 
 66 | {% macro redshift__boolean_type() %}
 67 |     boolean
 68 | {% endmacro %}
 69 | 
 70 | {% macro bigquery__boolean_type() %}
 71 |     BOOLEAN
 72 | {% endmacro %}
 73 | 
 74 | {% macro snowflake__boolean_type() %}
 75 |     BOOLEAN
 76 | {% endmacro %}
 77 | 
 78 | 
 79 | {% macro numeric_type() %}
 80 |     {{ adapter.dispatch('numeric_type', 're_data')() }}
 81 | {% endmacro %}
 82 | 
 83 | {% macro default__numeric_type() %}
 84 |     double precision
 85 | {% endmacro %}
 86 | 
 87 | {% macro redshift__numeric_type() %}
 88 |     DOUBLE PRECISION
 89 | {% endmacro %}
 90 | 
 91 | {% macro bigquery__numeric_type() %}
 92 |     FLOAT64
 93 | {% endmacro %}
 94 | 
 95 | {% macro snowflake__numeric_type() %}
 96 |     FLOAT
 97 | {% endmacro %}
 98 | 
 99 | 
100 | 
101 | 


--------------------------------------------------------------------------------
/macros/utils/column_types.sql:
--------------------------------------------------------------------------------
  1 | 
  2 | {% macro get_column_type(column) %}
  3 |     {% set result = adapter.dispatch('get_column_type', 're_data')(column) %}
  4 |     {{ return(result) }}
  5 | {% endmacro %}
  6 | 
  7 | 
  8 | {% macro default__get_column_type(column) %}
  9 |     
 10 |     {% if column.data_type in [
 11 |         'character varying',
 12 |         'varchar',
 13 |         'character',
 14 |         'char',
 15 |         'text'
 16 |     ] %}
 17 |         {{ return('text') }}
 18 | 
 19 |     {% elif column.data_type in [
 20 |             'smallint',
 21 |             'integer',
 22 |             'bigint',
 23 |             'decimal',
 24 |             'numeric',
 25 |             'real',
 26 |             'double precision',
 27 |             'enum',
 28 |         ] %}
 29 |         {{ return('numeric') }}
 30 |     
 31 |     {% elif column.data_type in [ 'boolean', 'bool' ] %}
 32 |     
 33 |         {{ return('boolean') }}
 34 | 
 35 |     {% else %}
 36 |         {{ return('unknown') }}
 37 | 
 38 |     {% endif %}
 39 | 
 40 | {% endmacro %}
 41 | 
 42 | 
 43 | {% macro snowflake__get_column_type(column) %}
 44 | 
 45 |     {% if column.DATA_TYPE in [
 46 |         'VARCHAR',
 47 |         'CHAR',
 48 |         'CHARACTER',
 49 |         'STRING',
 50 |         'TEXT'
 51 |     ] %}
 52 | 
 53 |         {{ return('text') }}
 54 | 
 55 |     {% elif column.DATA_TYPE in [
 56 |             'NUMBER',
 57 |             'DECIMAL',
 58 |             'NUMERIC',
 59 |             'INT',
 60 |             'INTEGER',
 61 |             'BIGINT',
 62 |             'SMALLINT',
 63 |             'TINYINT',
 64 |             'BYTEINT',
 65 |             'FLOAT',
 66 |             'FLOAT4',
 67 |             'FLOAT8',
 68 |             'DOUBLE',
 69 |             'DOUBLE PRECISION',
 70 |             'REAL',
 71 |     ] %}
 72 | 
 73 |         {{ return('numeric') }}
 74 | 
 75 |     {% elif column.data_type in [ 'BOOLEAN' ] %}
 76 |     
 77 |         {{ return('boolean') }}
 78 | 
 79 |     {% else %}
 80 | 
 81 |         {{ return('unknown') }}
 82 | 
 83 |     {% endif %}
 84 | 
 85 | {% endmacro %}
 86 | 
 87 | 
 88 | {% macro bigquery__get_column_type(column) %}
 89 | 
 90 |     {% if column.data_type in [ 'STRING' ] %}
 91 |         {{ return('text') }}
 92 | 
 93 |         {% elif column.data_type in [ "INT64", "NUMERIC", "BIGNUMERIC", "FLOAT64", "INTEGER"] %}
 94 |         {{ return('numeric') }}
 95 | 
 96 |         {% elif column.data_type in [ "BOOLEAN", "BOOL"] %}
 97 |         {{ return('boolean') }}
 98 |     {% else %}
 99 |         {{ return('unknown') }}
100 | 
101 |     {% endif %}
102 | {% endmacro %}
103 | 


--------------------------------------------------------------------------------
/macros/metrics/base/build_in/column_default.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {% macro re_data_metric_max(context) %}
 3 |     max({{context.column_name}})
 4 | {% endmacro %}
 5 | 
 6 | {% macro re_data_metric_min(context) %}
 7 |     min({{context.column_name}})
 8 | {% endmacro %}
 9 | 
10 | {% macro re_data_metric_avg(context) %}
11 |     avg(cast ({{context.column_name}} as {{ numeric_type() }}))
12 | {% endmacro %}
13 | 
14 | {% macro re_data_metric_stddev(context) %}
15 |     stddev(cast ( {{context.column_name}} as {{ numeric_type() }}))
16 | {% endmacro %}
17 | 
18 | {% macro re_data_metric_variance(context) %}
19 |     variance(cast ( {{context.column_name}} as {{ numeric_type() }}))
20 | {% endmacro %}
21 | 
22 | {% macro re_data_metric_max_length(context) %}
23 |     max(length({{context.column_name}}))
24 | {% endmacro %}
25 | 
26 | {% macro re_data_metric_min_length(context) %}
27 |     min(length({{context.column_name}}))
28 | {% endmacro %}
29 | 
30 | {% macro re_data_metric_avg_length(context) %}
31 |     avg(cast (length( {{context.column_name}} ) as {{ numeric_type() }}))
32 | {% endmacro %}
33 | 
34 | {% macro re_data_metric_nulls_count(context) %}
35 |     coalesce(
36 |         sum(
37 |             case when {{context.column_name}} is null
38 |                 then 1
39 |             else 0
40 |             end
41 |         ), 0
42 |     )
43 | {% endmacro %}
44 | 
45 | {% macro re_data_metric_missing_count(context) %}
46 |     coalesce(
47 |         sum(
48 |             case 
49 |             when {{context.column_name}} is null
50 |                 then 1
51 |             when {{context.column_name}} = ''
52 |                 then 1
53 |             else 0
54 |             end
55 |         ), 0
56 |     )
57 | {% endmacro %}
58 | 
59 | {% macro re_data_metric_nulls_percent(context) %}
60 |     {{ percentage_formula(re_data_metric_nulls_count(context), re_data_metric_row_count()) }}
61 | {% endmacro %}
62 | 
63 | {% macro re_data_metric_missing_percent(context) %}
64 |     {{ percentage_formula(re_data_metric_missing_count(context), re_data_metric_row_count()) }}
65 | {% endmacro %}
66 | 
67 | {% macro re_data_metric_count_true(context) %}
68 |     COALESCE(
69 |         SUM(
70 |             CASE
71 |                 WHEN {{ context.column_name }} IS TRUE THEN 1
72 |                 ELSE 0
73 |             END
74 |         ),
75 |         0
76 |     )
77 | {% endmacro %}
78 | 
79 | {% macro re_data_metric_count_false(context) %}
80 |     COALESCE(
81 |         SUM(
82 |             CASE
83 |                 WHEN {{ context.column_name }} IS FALSE THEN 1
84 |                 ELSE 0
85 |             END
86 |         ),
87 |         0
88 |     )
89 | {% endmacro %}
90 | 
91 | 
92 | 


--------------------------------------------------------------------------------
/integration_tests/seeds/monitoring/expected_anomalies.csv:
--------------------------------------------------------------------------------
 1 | table_name,column_name,metric,anomaly_detector,interval_length_sec
 2 | BUY_EVENTS,VALUE2,min,"{""name"": ""z_score"", ""threshold"": 0.5}",86400
 3 | BUY_EVENTS,VALUE2,avg,"{""name"": ""z_score"", ""threshold"": 0.5}",86400
 4 | BUY_EVENTS,---,freshness,"{""name"": ""z_score"", ""threshold"": 0.5}",86400
 5 | BUY_EVENTS,VALUE1,min,"{""name"": ""z_score"", ""threshold"": 0.5}",86400
 6 | BUY_EVENTS,VALUE1,max,"{""name"": ""z_score"", ""threshold"": 0.5}",86400
 7 | BUY_EVENTS,VALUE1,avg,"{""name"": ""z_score"", ""threshold"": 0.5}",86400
 8 | BUY_EVENTS,VALUE2,max,"{""name"": ""z_score"", ""threshold"": 0.5}",86400
 9 | SAMPLE_TABLE,VALUE1,avg,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
10 | SAMPLE_TABLE,EVENT_TYPE,unique_rows,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
11 | SAMPLE_TABLE,EVENT_TYPE,duplicate_rows,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
12 | SAMPLE_TABLE,VALUE1,max,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
13 | SAMPLE_TABLE,EVENT_TYPE,match_regex_percent,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
14 | SAMPLE_TABLE,VALUE1,diff,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
15 | SAMPLE_TABLE,VALUE1,stddev,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
16 | SAMPLE_TABLE,EVENT_TYPE,not_match_regex,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
17 | SAMPLE_TABLE,EVENT_TYPE,avg_length,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
18 | SAMPLE_TABLE,EVENT_TYPE,not_match_regex_percent,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
19 | SAMPLE_TABLE,EVENT_TYPE,match_regex,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
20 | SAMPLE_TABLE,VALUE1,variance,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
21 | SAMPLE_TABLE,EVENT_TYPE,max_length,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
22 | SAMPLE_TABLE,EVENT_TYPE,distinct_values,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
23 | SAMPLE_WITH_ANOMALY,---,freshness,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
24 | SAMPLE_WITH_ANOMALY,VALUE2,avg,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
25 | SAMPLE_WITH_ANOMALY,VALUE1,max,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
26 | SAMPLE_WITH_ANOMALY,VALUE2,min,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
27 | SAMPLE_WITH_ANOMALY,VALUE1,avg,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
28 | SAMPLE_WITH_ANOMALY,VALUE2,max,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
29 | SAMPLE_WITH_ANOMALY,VALUE1,min,"{""name"": ""modified_z_score"", ""threshold"": 0.6}",86400
30 | 


--------------------------------------------------------------------------------
/macros/utils/generate_alert_message.sql:
--------------------------------------------------------------------------------
 1 | {% macro generate_anomaly_message(column_name, metric, last_value, last_avg) -%}
 2 |   {{ return(adapter.dispatch('generate_anomaly_message', 're_data')(column_name, metric, last_value, last_avg)) }}
 3 | {%- endmacro %}
 4 | 
 5 | {% macro default__generate_anomaly_message(column_name, metric, last_value, last_avg) %}
 6 | 
 7 |     case when {{ column_name }} != '' then metric || '(' || column_name || ')'
 8 |     else metric
 9 |     end 
10 |     || ' is ' ||
11 |     {{ to_2dp( percentage_formula('last_value - last_avg', last_avg) ) }}
12 |     || '% ' ||
13 |     {{ comparison_text(last_value, last_avg) }}
14 |     || ' average.'
15 | {% endmacro %}
16 | 
17 | {% macro to_2dp(val) %}
18 |     {{ adapter.dispatch('to_2dp', 're_data')(val) }}
19 | {% endmacro %}
20 | 
21 | {% macro default__to_2dp(val) %}
22 |     trim(to_char({{ val }}, '9999999999999999990D00'))
23 | {% endmacro %}
24 | 
25 | {% macro bigquery__to_2dp(val) %}
26 |     format('%.2f', {{ val }})
27 | {% endmacro %}
28 | 
29 | {% macro seconds_to_hours(val) %}
30 |     cast({{ val }} as {{ numeric_type() }}) / 3600
31 | {% endmacro %}
32 | 
33 | {% macro generate_metric_value_text(metric, value) %}
34 |     case 
35 |         when {{ metric }} = 'freshness' 
36 |             then cast({{ to_2dp(seconds_to_hours(value)) }} as {{ string_type() }}) || ' hours'
37 |         when {{ regex_match_expression(metric, 'percent') }} 
38 |             then cast({{ to_2dp(value) }} as {{ string_type() }}) || '%'
39 |         when {{ regex_match_expression(metric, 'count') }} 
40 |             then cast({{ value }} as {{ string_type() }})
41 |         else cast({{ to_2dp(value) }} as {{ string_type() }})
42 |     end
43 | 
44 | {% endmacro %}
45 | 
46 | {% macro generate_schema_change_message(operation, column_name, prev_column_name, prev_data_type, data_type, detected_time) %}
47 |     case 
48 |         when {{ operation }} = 'column_added'
49 |             then 'column ' || {{ column_name }} || ' of type ' || {{ data_type }} || ' was added.'
50 |         when {{ operation }} = 'column_removed'
51 |             then 'column ' || {{ prev_column_name }} || ' of type ' || {{ prev_data_type }} || ' was removed.'
52 |         when {{ operation }} = 'type_change'
53 |             then {{ column_name }} || ' column data type was changed from ' || {{ prev_data_type }} || ' to ' || {{ data_type }} || '.'
54 |         else ''
55 |     end
56 | {% endmacro %}
57 | 
58 | {% macro generate_failed_test_message(test_name, column_name) %}
59 |     case 
60 |         when {{ column_name }} is null
61 |             then 'Test ' || {{ test_name }} || ' failed.'
62 |         else
63 |             'Test ' || {{ test_name }} || ' failed for column ' || {{ column_name }} || '.'
64 |     end
65 | {% endmacro %}
66 | 


--------------------------------------------------------------------------------
/models/alerts/re_data_z_score.sql:
--------------------------------------------------------------------------------
 1 | {{
 2 |     config(
 3 |         materialized='incremental',
 4 |         unique_key = 'id',
 5 |         on_schema_change='sync_all_columns',
 6 |     )
 7 | }}
 8 | 
 9 | with z_score_without_id as (
10 | 
11 |     select
12 |         stats.table_name as table_name,
13 |         stats.column_name as column_name,
14 |         stats.metric as metric,
15 |         stats.interval_length_sec,
16 |         (last_metric.last_value - stats.last_avg) / (stats.last_stddev + 0.0000000001) as z_score_value,
17 |         case
18 |             when stats.last_median_absolute_deviation = 0 then
19 |                 (last_metric.last_value - stats.last_median) / (1.253314 * (stats.last_mean_absolute_deviation + 0.0000000001))
20 |             else
21 |                 (0.6745 * (last_metric.last_value - stats.last_median)) / (stats.last_median_absolute_deviation + 0.0000000001) 
22 |         end as modified_z_score_value,
23 |         last_metric.last_value as last_value,
24 |         stats.last_avg as last_avg,
25 |         stats.last_median as last_median,
26 |         stats.last_stddev as last_stddev,
27 |         stats.last_median_absolute_deviation,
28 |         stats.last_mean_absolute_deviation,
29 |         stats.last_third_quartile - stats.last_first_quartile as last_iqr,
30 |         stats.last_first_quartile,
31 |         stats.last_third_quartile,
32 |         {{ time_window_end() }} as time_window_end,
33 |         cast( {{dbt.current_timestamp_backcompat()}} as {{ timestamp_type() }} ) as computed_on
34 |     from
35 |         {{ ref('re_data_last_stats') }} as stats,
36 |         {{ ref('re_data_last_metrics') }} as last_metric
37 |     where
38 |         stats.table_name = last_metric.table_name and
39 |         stats.column_name = last_metric.column_name and
40 |         stats.metric = last_metric.metric and
41 |         (
42 |             stats.interval_length_sec = last_metric.interval_length_sec or
43 |             (stats.interval_length_sec is null and last_metric.interval_length_sec is null)
44 |         ) and
45 |         last_metric.last_value is not null and
46 |         stats.last_avg is not null and
47 |         stats.last_stddev is not null
48 |     )
49 | 
50 | select
51 |     cast ({{ dbt_utils.generate_surrogate_key([
52 |       'table_name',
53 |       'column_name',
54 |       'metric',
55 |       'interval_length_sec',
56 |       'time_window_end'
57 |     ]) }} as {{ string_type() }} ) as id,
58 |     table_name,
59 |     column_name,
60 |     metric,
61 |     z_score_value,
62 |     modified_z_score_value,
63 |     last_value,
64 |     last_avg,
65 |     last_median,
66 |     last_stddev,
67 |     last_median_absolute_deviation,
68 |     last_mean_absolute_deviation,
69 |     last_iqr,
70 |     last_first_quartile,
71 |     last_third_quartile,
72 |     time_window_end,
73 |     interval_length_sec,
74 |     computed_on
75 | 
76 | from z_score_without_id
77 | 


--------------------------------------------------------------------------------
/models/alerts/re_data_anomalies.sql:
--------------------------------------------------------------------------------
 1 | {{
 2 |     config(
 3 |         materialized='view'
 4 |     )
 5 | }}
 6 | select
 7 |     z.id,
 8 |     z.table_name,
 9 |     z.column_name,
10 |     z.metric,
11 |     z.z_score_value,
12 |     z.modified_z_score_value,
13 |     m.anomaly_detector,
14 |     z.last_value,
15 |     z.last_avg,
16 |     z.last_median,
17 |     z.last_stddev,
18 |     z.last_median_absolute_deviation,
19 |     z.last_mean_absolute_deviation,
20 |     z.last_iqr,
21 |     z.last_first_quartile - (cast( {{ json_extract('m.anomaly_detector', 'whisker_boundary_multiplier') }} as {{numeric_type()}} ) * z.last_iqr) lower_bound,
22 |     z.last_third_quartile + (cast( {{ json_extract('m.anomaly_detector', 'whisker_boundary_multiplier') }} as {{numeric_type()}} ) * z.last_iqr) upper_bound,
23 |     z.last_first_quartile,
24 |     z.last_third_quartile,
25 |     z.time_window_end,
26 |     z.interval_length_sec,
27 |     z.computed_on,
28 |     {{ re_data.generate_anomaly_message('z.column_name', 'z.metric', 'z.last_value', 'z.last_avg') }} as message,
29 |     {{ re_data.generate_metric_value_text('z.metric', 'z.last_value') }} as last_value_text
30 | from
31 |     {{ ref('re_data_z_score')}} z 
32 | left join {{ ref('re_data_selected') }} m 
33 | on {{ split_and_return_nth_value('table_name', '.', 1) }} = m.database
34 | and {{ split_and_return_nth_value('table_name', '.', 2) }} = m.schema
35 | and {{ split_and_return_nth_value('table_name', '.', 3) }} = m.name
36 | where
37 |     case when (lower(coalesce({{ json_extract('m.anomaly_detector', 'direction') }}, 'both')) = 'up' and z.last_value > z.last_avg)
38 |         or (lower(coalesce({{ json_extract('m.anomaly_detector', 'direction') }}, 'both')) = 'down' and z.last_value < z.last_avg)
39 |         or (lower(coalesce({{ json_extract('m.anomaly_detector', 'direction') }}, 'both')) != 'up' and lower(coalesce({{ json_extract('m.anomaly_detector', 'direction') }}, 'both')) != 'down')
40 |         then
41 |             case 
42 |                 when {{ json_extract('m.anomaly_detector', 'name') }} = 'z_score' 
43 |                     then abs(z_score_value) > cast({{ json_extract('m.anomaly_detector', 'threshold') }} as {{ numeric_type() }})
44 |                 when {{ json_extract('m.anomaly_detector', 'name') }} = 'modified_z_score' 
45 |                     then abs(modified_z_score_value) > cast( {{ json_extract('m.anomaly_detector', 'threshold') }} as {{numeric_type()}} )
46 |                 when {{ json_extract('m.anomaly_detector', 'name') }} = 'boxplot' 
47 |                     then (
48 |                         z.last_value < z.last_first_quartile - (cast( {{ json_extract('m.anomaly_detector', 'whisker_boundary_multiplier') }} as {{numeric_type()}} ) * z.last_iqr)
49 |                         or 
50 |                         z.last_value > z.last_third_quartile + (cast( {{ json_extract('m.anomaly_detector', 'whisker_boundary_multiplier') }} as {{numeric_type()}} ) * z.last_iqr)
51 |                     )
52 |                 else false
53 |             end
54 |         else false
55 |     end
56 | 


--------------------------------------------------------------------------------
/macros/public/validating/valid_with_regex.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | {% macro valid_regex(column_name, to_validate) %}
 4 | {% set pattern = re_data.get_regex_for(to_validate) %}
 5 |   case when 
 6 |     {{ column_name }} is null then false 
 7 |     else {{ re_data.regex_match_expression(column_name, pattern) }}
 8 |   end
 9 | {% endmacro %}
10 | 
11 | {% macro valid_email(column_name) %}
12 |     {{ re_data.valid_regex(column_name, 'email')}}
13 | {% endmacro %}
14 | 
15 | {% macro valid_date_eu(column_name) %}
16 |     {{ re_data.valid_regex(column_name, 'date_eu')}}
17 | {% endmacro %}
18 | 
19 | {% macro valid_date_us(column_name) %}
20 |     {{ re_data.valid_regex(column_name, 'date_us')}}
21 | {% endmacro %}
22 | 
23 | {% macro valid_date_inverse(column_name) %}
24 |     {{ re_data.valid_regex(column_name, 'date_inverse')}}
25 | {% endmacro %}
26 | 
27 | {% macro valid_date_iso_8601(column_name) %}
28 |     {{ re_data.valid_regex(column_name, 'date_iso_8601')}}
29 | {% endmacro %}
30 | 
31 | {% macro valid_time_24h(column_name) %}
32 |     {{ re_data.valid_regex(column_name, 'time_24h')}}
33 | {% endmacro %}
34 | 
35 | {% macro valid_time_12h(column_name) %}
36 |     {{ re_data.valid_regex(column_name, 'time_12h')}}
37 | {% endmacro %}
38 | 
39 | {% macro valid_time(column_name) %}
40 |     {{ re_data.valid_regex(column_name, 'time')}}
41 | {% endmacro %}
42 | 
43 | {% macro valid_ip_v4(column_name) %}
44 |     {{ re_data.valid_regex(column_name, 'ipv4_address')}}
45 | {% endmacro %}
46 | 
47 | {% macro valid_ip_v6(column_name) %}
48 |     {{ re_data.valid_regex(column_name, 'ipv6_address')}}
49 | {% endmacro %}
50 | 
51 | {% macro valid_ip(column_name) %}
52 |     (
53 |         {{ re_data.valid_regex(column_name, 'ipv4_address')}}
54 |     or 
55 |         {{ re_data.valid_regex(column_name, 'ipv6_address')}}
56 |     )
57 | {% endmacro %}
58 | 
59 | {% macro valid_number(column_name) %}
60 |     {{ re_data.valid_regex(column_name, 'number_whole')}}
61 | {% endmacro %}
62 | 
63 | {% macro valid_number_decimal_point(column_name) %}
64 |     {{ re_data.valid_regex(column_name, 'number_decimal_point')}}
65 | {% endmacro %}
66 | 
67 | {% macro valid_number_decimal_comma(column_name) %}
68 |     {{ re_data.valid_regex(column_name, 'number_decimal_comma')}}
69 | {% endmacro %}
70 | 
71 | {% macro valid_number_percentage(column_name) %}
72 |     {{ re_data.valid_regex(column_name, 'number_percentage')}}
73 | {% endmacro %}
74 | 
75 | {% macro valid_number_percentage_point(column_name) %}
76 |     {{ re_data.valid_regex(column_name, 'number_percentage_point')}}
77 | {% endmacro %}
78 | 
79 | {% macro valid_number_percentage_comma(column_name) %}
80 |     {{ re_data.valid_regex(column_name, 'number_percentage_comma')}}
81 | {% endmacro %}
82 | 
83 | {% macro valid_phone(column_name) %}
84 |     {{ re_data.valid_regex(column_name, 'phone')}}
85 | {% endmacro %}
86 | 
87 | {% macro valid_uuid(column_name) %}
88 |     {{ re_data.valid_regex(column_name, 'uuid')}}
89 | {% endmacro %}
90 | 
91 | {% macro valid_credit_card(column_name) %}
92 |     {{ re_data.valid_regex(column_name, 'credit_card_number')}}
93 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/dbt_project.yml:
--------------------------------------------------------------------------------
  1 | name: "re_data_integration_tests"
  2 | version: "1.0"
  3 | config-version: 2
  4 | 
  5 | # Changed when running tests
  6 | profile: "re_data_postgres"
  7 | 
  8 | target-path: "target"
  9 | clean-targets: ["target", "dbt_modules", "dbt_packages"]
 10 | 
 11 | models:
 12 |   re_data:
 13 |     enabled: true
 14 | 
 15 | sources:
 16 |   re_data_integration_tests:
 17 |     +re_data_monitored: true
 18 |     +re_data_time_filter: null
 19 |   
 20 | vars:
 21 |   re_data:store_table_samples: true
 22 |   re_data:anomaly_detector:
 23 |     name: modified_z_score
 24 |     threshold: 0.6
 25 |   re_data:max_columns_in_query: 1
 26 | 
 27 |   re_data:select:
 28 |     - tag:testtag
 29 |     - sample_table
 30 |     - sample_without_time_filter
 31 |     - sample_with_anomaly
 32 |     - re_data_source_test_table
 33 | 
 34 |   re_data:metrics_groups:
 35 |     integration_test_group:
 36 |       table:
 37 |         - row_count
 38 |         - freshness
 39 |         - my_distinct_table_rows
 40 | 
 41 |       column:
 42 |         numeric:
 43 |           - min
 44 |           - max
 45 |           - avg
 46 |           - stddev
 47 |           - variance
 48 |           - nulls_count
 49 |           - nulls_percent
 50 |           - diff # my own custom metric
 51 | 
 52 |         text:
 53 |           - min_length
 54 |           - max_length
 55 |           - avg_length
 56 |           - nulls_count
 57 |           - nulls_percent
 58 |           - missing_percent
 59 |           - missing_count
 60 |   
 61 |   re_data:default_metrics:
 62 |     - integration_test_group
 63 | 
 64 | seeds:
 65 |   +schema: seeds
 66 |   +quote_columns: false
 67 | 
 68 |   re_data_integration_tests:
 69 |     monitoring:
 70 |       sample_with_anomaly:
 71 |         +re_data_monitored: true
 72 |         +re_data_time_filter: creation_time
 73 | 
 74 |       sample_without_time_filter:
 75 |         +re_data_monitored: true
 76 |         +re_data_time_filter: null
 77 | 
 78 |       sample_table:
 79 |         +re_data_monitored: true
 80 |         +re_data_time_filter: creation_time
 81 |         
 82 |         +re_data_columns: 
 83 |           - event_type
 84 |           - value1
 85 |           - value2
 86 |           - null_value
 87 |         
 88 |         +re_data_metrics:
 89 |           table:
 90 |             - my_custom_table_metric # my own custom metric
 91 |             - distinct_table_rows
 92 |           column:
 93 |             event_type:
 94 |               - regex_test:
 95 |                   regex: ([A-Za-z0-9]+)
 96 |               - match_regex:
 97 |                   regex: ^sell
 98 |               - match_regex_percent:
 99 |                   regex: ^sell
100 |               - not_match_regex:
101 |                   regex: ^buy
102 |               - not_match_regex_percent:
103 |                   regex: ^buy
104 |               - distinct_values
105 |               - duplicate_values
106 |               - duplicate_rows
107 |               - unique_rows
108 | 
109 |       expected_z_score:
110 |         +column_types:
111 |           time_window_end: "TIMESTAMP"
112 | 
113 |       expected_metrics:
114 |         +column_types:
115 |           time_window_start: "TIMESTAMP"
116 |           time_window_end: "TIMESTAMP"
117 | 


--------------------------------------------------------------------------------
/macros/metrics/base/queries.sql:
--------------------------------------------------------------------------------
 1 | {% macro metrics_base_compute_for_thread(thread_value, ref_model) %}
 2 |     {%- set tables =  run_query(re_data.get_tables()) %}
 3 |     {%- for mtable in tables %}
 4 |         -- we are splitting computing metrics to 4 different threads
 5 |         {% set for_loop_mod = (loop.index % 4) %}
 6 |         {% if for_loop_mod == thread_value %}
 7 |             {% set model = get_model_config(mtable) %}
 8 | 
 9 |             {% set columns_to_query = [] %}
10 |             {% set size = 0 %}
11 | 
12 |             {% for column in model.columns %}
13 |                 {% set column_name = re_data.row_value(column, 'column_name') %}
14 |                 
15 |                 {% if should_compute_metric(model, column_name) %}
16 |                     {% do columns_to_query.append(column) %}
17 |                 {% endif %}
18 | 
19 |                 {% set columns_size = columns_to_query| length %}
20 | 
21 |                 {% if columns_size == var('re_data:max_columns_in_query') %}
22 |                     {%- set insert_stats_query = re_data.metrics_base_insert(model, ref_model, columns_to_query) -%}
23 | 
24 |                     {% if insert_stats_query %}
25 |                         {% do run_query(insert_stats_query) %}
26 |                     {% endif %}
27 |                     {% do columns_to_query.clear() %}
28 |                 {% endif %}
29 |             {% endfor %}
30 | 
31 |             {%- set insert_stats_query = re_data.metrics_base_insert(model, ref_model, columns_to_query, table_level=True) -%}
32 |             {% do run_query(insert_stats_query) %}
33 | 
34 |             {{ dbt_utils.log_info('[re_data_log] - finished computing metrics for:' ~ model.model_name) }}
35 |         {% endif %}
36 |     {% endfor %}
37 | {% endmacro %}
38 | 
39 | {% macro metrics_base_insert(model, ref_model, columns, table_level=False) %}
40 | 
41 |     {% set col_exprs = re_data.metrics_base_expressions(model, columns, table_level) %}
42 |     {% if col_exprs == [] %}
43 |         {{ return ('') }}
44 |     {% endif %}
45 | 
46 |     insert into {{ ref(ref_model) }}
47 |     with temp_table_metrics as (
48 |     select 
49 |         {%- for col_expr in col_exprs %}
50 |             ( {{ col_expr.expr }} ) as {{ re_data.quote_column_name(col_expr.col_name + '___' + col_expr.metric) }}
51 |             {%- if not loop.last %},{%- endif %}
52 |         {% endfor %}
53 |     from 
54 |         {{ model.table_name }}
55 |     where
56 |         {{ in_time_window(model.time_filter) }}
57 |     )
58 | 
59 |     {%- for col_expr in col_exprs %}
60 |         {% set final_metric_name = get_final_metric_name(col_expr.metric, model.time_filter) %}
61 |         
62 |         select '{{model.table_name}}' as table_name, '{{ col_expr.col_name }}' as column_name, '{{ final_metric_name }}' as metric, {{ re_data.quote_column_name(col_expr.col_name + '___' + col_expr.metric) }} as value
63 |         from temp_table_metrics
64 |         {% if not loop.last %}union all{% endif %}
65 |     {% endfor %}
66 | 
67 | {% endmacro %}
68 | 
69 | {% macro get_final_metric_name(metric_name, time_filter) %}
70 |     {% if time_filter is none %}
71 |         {{ return ('global__' + metric_name) }}
72 |     {% else %}
73 |         {{ return (metric_name) }}
74 |     {% endif %}
75 | {% endmacro %}


--------------------------------------------------------------------------------
/integration_tests/python_tests/test_monitoring.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | import copy
 3 | import yaml
 4 | import json
 5 | from datetime import datetime, timedelta
 6 | from .utils.run import dbt_seed, dbt_run, dbt_test, dbt_command, dbt_build
 7 | 
 8 | RUN_TIME = datetime(2021, 5, 2, 0, 0, 0)
 9 | 
10 | DBT_VARS = {
11 |     're_data:time_window_start': (RUN_TIME - timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S"),
12 |     're_data:time_window_end': RUN_TIME.strftime("%Y-%m-%d %H:%M:%S"),
13 |     're_data:save_test_history': True
14 | }
15 | 
16 | def test_monitoring(db, source_schema):
17 |     DBT_VARS.update({'source_schema': source_schema})
18 | 
19 |     load_deps = 'dbt deps'
20 |     assert os.system(load_deps) == 0
21 | 
22 |     dbt_vars = copy.deepcopy(DBT_VARS)
23 |     
24 |     print (f"Running setup and tests for {db}")
25 | 
26 |     dbt_seed('--select monitoring', db, dbt_vars)
27 |     dbt_run('--models transformed', db, dbt_vars)
28 |     dbt_command(
29 |         f'dbt run-operation create_test_source_tables',
30 |         db, dbt_vars
31 |     )
32 | 
33 |     print (f"Computing re_data metrics for {db}") 
34 |     dbt_run('--select package:re_data', db, dbt_vars)
35 | 
36 |     dbt_command(
37 |         f'dbt run-operation schema_change_buy_events_add_column',
38 |         db, dbt_vars
39 |     )
40 | 
41 |     # update dbts_vars to run dbt for next day of data
42 |     dbt_vars['re_data:time_window_start'] = dbt_vars['re_data:time_window_end']
43 |     dbt_vars['re_data:time_window_end'] = (RUN_TIME + timedelta(days=1)).strftime("%Y-%m-%d %H:%M:%S")
44 | 
45 |     dbt_command(
46 |         'dbt run --select package:re_data --fail-fast',
47 |         db, dbt_vars
48 |     )
49 | 
50 |     dbt_command(
51 |         'dbt run --select monitoring.*', db, dbt_vars
52 |     )
53 | 
54 |     dbt_test('--select test_re_data_anomalies test_re_data_metrics test_re_data_z_score test_re_data_table_samples re_data_metrics transformed', db, dbt_vars)
55 |     # dbt build will "duplicate" saved test result history
56 |     dbt_build('--select test_re_data_anomalies test_re_data_metrics test_re_data_z_score test_re_data_table_samples re_data_metrics transformed', db, dbt_vars)
57 | 
58 |     # tests test_history seperately, because those are actually added to DB after running
59 |     # dbt test command
60 |     dbt_test('--select test_re_data_test_history', db, dbt_vars)
61 | 
62 |     op_vars = {
63 |         'start_date': RUN_TIME.strftime("%Y-%m-%d"),
64 |         'end_date': (RUN_TIME + timedelta(days=1)).strftime("%Y-%m-%d"),
65 |         'interval': 'days:1'
66 |     }
67 |     op_vars = yaml.dump(op_vars)
68 |     
69 |     dbt_command(
70 |         f'dbt run-operation generate_overview --args "{op_vars}"',
71 |         db, dbt_vars
72 |     )
73 | 
74 |     overview = json.load(open(f'../target/re_data/overview.json'))
75 |     expected_types = ['metric', 'schema_change', 'schema', 'alert', 'anomaly']
76 |     all_types = set()
77 | 
78 |     # some simple check for now
79 |     for obj in overview:
80 |         all_types.add(obj['type'])
81 |         assert obj['table_name']
82 |         assert 'column_name' in obj
83 |         assert 'computed_on' in obj
84 | 
85 |     assert len(overview) > 100
86 |     assert sorted(all_types) == sorted(expected_types)
87 | 
88 |     print (f"Running tests completed for {db}")
89 | 


--------------------------------------------------------------------------------
/macros/utils/time_macros.sql:
--------------------------------------------------------------------------------
  1 | 
  2 | {% macro time_window_start() %}
  3 |     cast('{{- var('re_data:time_window_start') -}}' as timestamp) 
  4 | {% endmacro %}
  5 | 
  6 | 
  7 | {% macro time_window_end() %}
  8 |     cast('{{- var('re_data:time_window_end') -}}' as timestamp)
  9 | {% endmacro %}
 10 | 
 11 | 
 12 | {% macro anamaly_detection_time_window_start() %}
 13 |    {{ adapter.dispatch('anamaly_detection_time_window_start', 're_data')() }}
 14 | {% endmacro %}
 15 | 
 16 | {% macro default__anamaly_detection_time_window_start() %}
 17 |     {{ time_window_start() }} - interval '{{var('re_data:anomaly_detection_look_back_days')}} days'
 18 | {% endmacro %}
 19 | 
 20 | {% macro bigquery__anamaly_detection_time_window_start() %}
 21 |     DATE_ADD({{ time_window_start() }}, INTERVAL -{{var('re_data:anomaly_detection_look_back_days')}} DAY)
 22 | {% endmacro %}
 23 | 
 24 | {% macro snowflake__anamaly_detection_time_window_start() %}
 25 |     DATEADD('DAY', -{{-var('re_data:anomaly_detection_look_back_days')-}}, {{ time_window_start() }})
 26 | {% endmacro %}
 27 | 
 28 | 
 29 | {% macro interval_length_sec(start_timestamp, end_timestamp) %}
 30 |     {{ adapter.dispatch('interval_length_sec', 're_data')(start_timestamp, end_timestamp) }}
 31 | {% endmacro %}
 32 | 
 33 | {% macro default__interval_length_sec(start_timestamp, end_timestamp) %}
 34 |    EXTRACT(EPOCH FROM ({{ end_timestamp }} - {{ start_timestamp }} ))
 35 | {% endmacro %}
 36 | 
 37 | {% macro bigquery__interval_length_sec(start_timestamp, end_timestamp) %}
 38 |     TIMESTAMP_DIFF ({{ end_timestamp }}, {{ start_timestamp }}, SECOND)
 39 | {% endmacro %}
 40 | 
 41 | {% macro snowflake__interval_length_sec(start_timestamp, end_timestamp) %}
 42 |    timediff(second, {{ start_timestamp }}, {{ end_timestamp }})
 43 | {% endmacro %}
 44 | 
 45 | {% macro redshift__interval_length_sec(start_timestamp, end_timestamp) %}
 46 |    DATEDIFF(second, {{ start_timestamp }}, {{ end_timestamp }})
 47 | {% endmacro %}
 48 | 
 49 | {%- macro in_time_window(time_column) %}
 50 |     {# /* If not time_filter is specified, we compute the metric over the entire table else we filter for the time frame */ #}
 51 |     {% if time_column is none %}
 52 |             true
 53 |     {% else %}
 54 |         {{ adapter.dispatch('in_time_window', 're_data')(time_column) }}
 55 |     {% endif %}
 56 | {% endmacro -%}
 57 | 
 58 | {% macro default__in_time_window(time_column) %}
 59 |     {{time_column}} >= {{ time_window_start() }} and
 60 |     {{time_column}} < {{ time_window_end() }}
 61 | {% endmacro %}
 62 | 
 63 | {% macro bigquery__in_time_window(time_column) %}
 64 |     cast({{time_column}} as timestamp) >= {{ time_window_start() }} and
 65 |     cast({{time_column}} as timestamp) < {{ time_window_end() }}
 66 | {% endmacro %}
 67 | 
 68 | 
 69 | {% macro format_timestamp(column_name) %}
 70 |     {{ adapter.dispatch('format_timestamp', 're_data')(column_name) }}
 71 | {% endmacro %}
 72 | 
 73 | {% macro default__format_timestamp(column_name) %}
 74 |     to_char({{column_name}}, 'YYYY-MM-DD HH24:MI:SS')
 75 | {% endmacro %}
 76 | 
 77 | {% macro bigquery__format_timestamp(column_name) %}
 78 |     FORMAT_TIMESTAMP('%Y-%m-%d %H:%I:%S', {{column_name}})
 79 | {% endmacro %}
 80 | 
 81 | /*
 82 | provide a common way to compare time vs a range: start_date <= target <= end_date
 83 | if start_date is none: target <= end_date
 84 | if end_date is none: target >= start_date
 85 | think none as infinity
 86 | */
 87 | {%- macro in_date_window(target, start_date, end_date) %}
 88 |   {{ adapter.dispatch('in_date_window','re_data')(target, start_date, end_date) }}
 89 | {% endmacro -%}
 90 | 
 91 | {% macro default__in_date_window(target, start_date, end_date) %}
 92 |   {% if start_date is not none and end_date is not none %}
 93 |     date({{target}}) between '{{start_date}}' and '{{end_date}}'
 94 |   {% elif start_date is none %}
 95 |     date({{target}}) <= '{{end_date}}'
 96 |   {% elif end_date is none %}
 97 |     date({{target}}) >= '{{start_date}}'
 98 |   {% endif %}
 99 | {% endmacro %}
100 | 
101 | 


--------------------------------------------------------------------------------
/models/metrics/for_anomalies/re_data_last_stats.sql:
--------------------------------------------------------------------------------
  1 | {% set columns_to_group_by = 'table_name, column_name, metric, interval_length_sec' %}
  2 | 
  3 | with median_value as (
  4 |     select distinct
  5 |         table_name,
  6 |         column_name,
  7 |         metric,
  8 |         interval_length_sec,
  9 |         avg(value) {% if target.type not in postgres_type_db() %} over(partition by {{ columns_to_group_by }}) {% endif %} as last_avg,
 10 |         {{ percentile(percentile_field='value', partition_field=columns_to_group_by, percent='0.25') }} as last_first_quartile,
 11 |         {{ percentile(percentile_field='value', partition_field=columns_to_group_by, percent='0.5') }} as last_median,
 12 |         {{ percentile(percentile_field='value', partition_field=columns_to_group_by, percent='0.75') }} as last_third_quartile
 13 |     from
 14 |         {{ ref('re_data_base_metrics') }}
 15 |     where
 16 |         time_window_end > {{- anamaly_detection_time_window_start() -}} and
 17 |         time_window_end <= {{- time_window_end() -}}
 18 |     {% if target.type in postgres_type_db() %} 
 19 |         group by
 20 |             {{ columns_to_group_by }}
 21 |     {% endif %}
 22 |     
 23 | ), abs_deviation as (
 24 |     select 
 25 |         s.table_name,
 26 |         s.column_name,
 27 |         s.metric,
 28 |         s.interval_length_sec,
 29 |         abs( s.value - mv.last_avg ) as absolute_deviation_from_mean,
 30 |         abs( s.value - mv.last_median ) as absolute_deviation_from_median
 31 |     from
 32 |         {{ ref('re_data_base_metrics') }} s
 33 |     left join 
 34 |         median_value mv
 35 |         on
 36 |             s.table_name = mv.table_name and
 37 |             s.column_name = mv.column_name and
 38 |             s.metric = mv.metric and
 39 |             s.interval_length_sec = mv.interval_length_sec
 40 |     where
 41 |         s.time_window_end > {{- anamaly_detection_time_window_start() -}} and
 42 |         s.time_window_end <= {{- time_window_end() -}}
 43 | ), median_abs_deviation as (
 44 |     select distinct
 45 |         table_name,
 46 |         column_name,
 47 |         metric,
 48 |         interval_length_sec,
 49 |         avg(absolute_deviation_from_mean) {% if target.type not in postgres_type_db() %} over(partition by {{ columns_to_group_by }}) {% endif %} as mean_absolute_deviation,
 50 |         {{ percentile(percentile_field='absolute_deviation_from_median', partition_field=columns_to_group_by, percent='0.5') }} as median_absolute_deviation
 51 |     from
 52 |         abs_deviation
 53 |     {% if target.type in postgres_type_db() %} 
 54 |         group by
 55 |             {{ columns_to_group_by }}
 56 |     {% endif %}
 57 | ), stats as (
 58 |     select
 59 |         table_name,
 60 |         column_name,
 61 |         metric,
 62 |         stddev(value) as last_stddev,
 63 |         max(time_window_end) as last_metric_time,
 64 |         interval_length_sec,
 65 |         max(computed_on) as computed_on
 66 |     from
 67 |         {{ ref('re_data_base_metrics') }}
 68 |     where
 69 |         time_window_end > {{- anamaly_detection_time_window_start() -}} and
 70 |         time_window_end <= {{- time_window_end() -}}
 71 |     group by
 72 |         {{ columns_to_group_by }}
 73 | )
 74 | 
 75 | select
 76 |     s.table_name,
 77 |     s.column_name,
 78 |     s.metric,
 79 |     mv.last_avg,
 80 |     s.last_stddev,
 81 |     s.last_metric_time,
 82 |     s.interval_length_sec,
 83 |     s.computed_on,
 84 |     mv.last_median,
 85 |     mv.last_first_quartile,
 86 |     mv.last_third_quartile,
 87 |     md.median_absolute_deviation last_median_absolute_deviation,
 88 |     md.mean_absolute_deviation last_mean_absolute_deviation
 89 | from
 90 |     stats s
 91 | left join
 92 |     median_value mv
 93 |     on
 94 |         s.table_name = mv.table_name and
 95 |         s.column_name = mv.column_name and
 96 |         s.metric = mv.metric and
 97 |         s.interval_length_sec = mv.interval_length_sec
 98 | left join
 99 |     median_abs_deviation md
100 |     on 
101 |         s.table_name = md.table_name and
102 |         s.column_name = md.column_name and
103 |         s.metric = md.metric and
104 |         s.interval_length_sec = md.interval_length_sec
105 | 


--------------------------------------------------------------------------------
/macros/metrics/base/build_in/optional_column_metrics.sql:
--------------------------------------------------------------------------------
  1 | {% macro re_data_metric_regex_count(column_name, pattern) %}
  2 |     coalesce(
  3 |         sum(
  4 |             case when {{ regex_match_expression(column_name, pattern) }}
  5 |                 then 1
  6 |             else 0
  7 |             end
  8 |         ), 0
  9 |     )
 10 | {% endmacro %}
 11 | 
 12 | {% macro re_data_metric_match_regex(context) %}
 13 |     {{ re_data_metric_regex_count(context.column_name, context.config.regex) }}
 14 | {% endmacro %}
 15 | 
 16 | {% macro re_data_metric_match_regex_percent(context) %}
 17 |     {{ percentage_formula(re_data_metric_match_regex(context), re_data_metric_row_count()) }}
 18 | {% endmacro %}
 19 | 
 20 | {% macro re_data_metric_not_match_regex(context) %}
 21 |     {{ re_data_metric_row_count() }} - {{ re_data_metric_regex_count(context.column_name, context.config.regex) }}
 22 | {% endmacro %}
 23 | 
 24 | {% macro re_data_metric_not_match_regex_percent(context) %}
 25 |     {{ percentage_formula(re_data_metric_not_match_regex(context), re_data_metric_row_count()) }}
 26 | {% endmacro %}
 27 | 
 28 | {% macro re_data_metric_distinct_values(context) %}
 29 |     {{ distinct_values(context) }}
 30 | {% endmacro %}
 31 | 
 32 | {% macro distinct_values(context) %}
 33 |     {{ adapter.dispatch('distinct_values', 're_data')(context) }}
 34 | {% endmacro %}
 35 | 
 36 | {% macro default__distinct_values(context) %}
 37 |     coalesce(
 38 |         count(distinct {{ context.column_name }} )
 39 |     , 0)
 40 | {% endmacro %}
 41 | 
 42 | {% macro postgres__distinct_values(context) %}
 43 |     {# /* In postgres, its faster to count distinct values in a column by selecting then counting in separate steps */ #}
 44 |     with temp_table as (
 45 |             select distinct {{ context.column_name }} from {{ context.table_name }}
 46 |             where {{ in_time_window(context.time_filter) }}
 47 |         )
 48 |     select coalesce(count(*), 0) from temp_table
 49 | {% endmacro %}
 50 | 
 51 | {% macro re_data_metric_approx_distinct_values(context) %}
 52 |     {{ approx_distinct_values(context) }}
 53 | {% endmacro %}
 54 | 
 55 | {% macro approx_distinct_values(context) %}
 56 |     {{ adapter.dispatch('approx_distinct_values', 're_data')(context) }}
 57 | {% endmacro %}
 58 | 
 59 | {% macro default__approx_distinct_values(context) %}
 60 |     {# /* No approximate distinct count in postgres so we default to using a distinct count */ #}
 61 |     {{ re_data_metric_distinct_values(context) }}
 62 | {% endmacro %}
 63 | 
 64 | {% macro redshift__approx_distinct_values(context) %}
 65 |     approximate {{ re_data_metric_distinct_values(context.column_name) }}
 66 | {% endmacro %}
 67 | 
 68 | {% macro bigquery__approx_distinct_values(context) %}
 69 |     approx_count_distinct({{ context.column_name }})
 70 | {% endmacro %}
 71 | 
 72 | {% macro snowflake__approx_distinct_values(context) %}
 73 |     approx_count_distinct({{ context.column_name }})
 74 | {% endmacro %}
 75 | 
 76 | {% macro re_data_metric_duplicate_values(context) %}
 77 |         with temp_table as (
 78 |             select {{ context.column_name }} from {{ context.table_name }}
 79 |             where {{ in_time_window(context.time_filter) }}
 80 |             group by {{ context.column_name }}
 81 |             having count(1) > 1
 82 |         )
 83 |         select coalesce(count(*), 0) from temp_table
 84 | {% endmacro %}
 85 | 
 86 | {% macro re_data_metric_duplicate_rows(context) %}
 87 |         with temp_table as (
 88 |             select {{ context.column_name }}, count(1) as row_count from {{ context.table_name }}
 89 |             where {{ in_time_window(context.time_filter) }}
 90 |             group by {{ context.column_name }}
 91 |             having count(1) > 1
 92 |         )
 93 |         select coalesce(sum(row_count), 0) from temp_table
 94 | {% endmacro %}
 95 | 
 96 | {% macro re_data_metric_unique_rows(context) %}
 97 |         with temp_table as (
 98 |             select {{ context.column_name }}, count(1) as row_count from {{ context.table_name }}
 99 |             where {{ in_time_window(context.time_filter) }}
100 |             group by {{ context.column_name }}
101 |             having count(1) = 1
102 |         )
103 |         select coalesce(sum(row_count), 0) from temp_table
104 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/meta/get_monitored.sql:
--------------------------------------------------------------------------------
 1 | {% macro pub_monitored_from_graph() %}
 2 |     {% set monitored = [] %}
 3 |     {% set both = []%}
 4 |     {% do both.extend(graph.nodes.values()) %}
 5 |     {% do both.extend(graph.sources.values()) %}
 6 |     {% set owners_config = re_data.get_owners_config() %}
 7 | 
 8 |     {% set select_var = var('re_data:select') %}
 9 |     {% set select_all = true %}
10 | 
11 |     {% set selected_nodes = none %}
12 |     {% set selected_tags = none %}
13 | 
14 |     {% if select_var is not none %}
15 |         {% set select_all = false %}
16 |         {% set selected_nodes = dict() %}
17 |         {% set selected_tags = dict() %}
18 | 
19 |         {% for el in select_var %}
20 |             {% if el.startswith('tag:') %}
21 |                 {% do selected_tags.update({el[4:]: True}) %}
22 |             {% else %}
23 |                 {% do selected_nodes.update({el: True}) %}
24 |             {% endif %}
25 |         {% endfor %}
26 |     {% endif %}
27 | 
28 |     {% for el in both %}
29 |         {% if el.resource_type in ['model', 'seed', 'source'] %}
30 |             {% if el.config.get('re_data_monitored') %}
31 |                 {% set target_name = el.identifier or el.alias or el.name %}
32 | 
33 |                 {% if select_all %}
34 |                     {% set selected = true %}
35 |                 {% else %}
36 |                     {% set selected_name = selected_nodes.get(target_name, false) %}
37 |                     {% set selected_tag = [] %}
38 | 
39 |                     {% for tag in el.tags %}
40 |                         {% if selected_tags.get(tag, false) %}
41 |                             {% do selected_tag.append(true) %}
42 |                         {% endif %}
43 |                     {% endfor %}
44 |                     
45 |                     {% set selected = selected_name or (selected_tag | length > 0) %}
46 |                 {% endif %}
47 | 
48 |                 {% set metrics_groups = el.config.get('re_data_metrics_groups', var('re_data:default_metrics')) %}
49 |                 {% set additional_metrics = el.config.get('re_data_metrics', {}) %}
50 | 
51 |                 {% do monitored.append({
52 |                     'name': re_data.name_in_db(target_name),
53 |                     'schema': re_data.name_in_db(el.schema),
54 |                     'database': re_data.name_in_db(el.database),
55 |                     'time_filter': el.config.get('re_data_time_filter', none),
56 |                     'metrics_groups': metrics_groups,
57 |                     'additional_metrics': re_data.metrics_in_db(additional_metrics),
58 |                     'metrics': re_data.metrics_in_db(re_data.final_metrics(metrics_groups, additional_metrics)),
59 |                     'columns': re_data.columns_in_db(el.config.get('re_data_columns', none)),
60 |                     'anomaly_detector': el.config.get('re_data_anomaly_detector', var('re_data:anomaly_detector', {})),
61 |                     'owners': re_data.prepare_model_owners(el.config.get('re_data_owners', []), owners_config),
62 |                     'selected': selected
63 |                     })
64 |                 %}
65 |             {% endif %}
66 |         {% endif %}
67 |     {% endfor %}
68 | 
69 |     {{ return(monitored) }}
70 | {% endmacro %}
71 | 
72 | {% macro get_owners_config() %}
73 |     {% set owners_config = var('re_data:owners_config', {}) %}
74 |     {{ return (owners_config) }}
75 | {% endmacro %}
76 | 
77 | {% macro prepare_model_owners(re_data_owners, owners_config) %}
78 |     {% set owners = {} %}
79 |     {% set seen_identifiers = {} %}
80 |     {% for owner in re_data_owners if owners_config.get(owner) %}
81 |         {% set members = owners_config.get(owner) %}
82 |         {% for member in members %}
83 |             {% set identifier = member.get('identifier') %}
84 |             {% if identifier not in seen_identifiers %}
85 |             {% do seen_identifiers.update({identifier: true }) %}
86 |             {% do owners.update({
87 |                 identifier: {
88 |                     'notify_channel': member.get('type'),
89 |                     'owner': owner,
90 |                     'name': member.get('name') 
91 |                 } 
92 |             }) %}
93 |             {% endif %}
94 |         {% endfor %}
95 |     {% endfor %}
96 |     {{ return (owners) }}
97 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/metrics/base/expression.sql:
--------------------------------------------------------------------------------
  1 | {% macro metrics_base_expressions(model, columns, table_level=False) %}
  2 | 
  3 |     {% set col_expr = [] %}
  4 | 
  5 |     {% for col in columns %}
  6 |         {% set column_name = re_data.row_value(col, 'column_name') %}
  7 |         {% do col_expr.extend(re_data.metrics_base_expression_column_all(model, col)) %}
  8 |     {% endfor %}
  9 | 
 10 |     {% if table_level %}
 11 |         {% do col_expr.extend(re_data.metrics_base_expresion_table_all(model)) %}
 12 |     {% endif %}
 13 | 
 14 |     {{ return (col_expr) }}
 15 | 
 16 | {% endmacro %}
 17 | 
 18 | {% macro metrics_base_expression_column_all(model, column) %}
 19 | 
 20 |     {%- set col_expr = [] %}
 21 |     {%- set metrics_to_compute = [] %}
 22 |     {% set column_name = re_data.row_value(column, 'column_name') %}
 23 |     {% set data_type = model.columns_info[column_name].data_type %}
 24 |     {% do metrics_to_compute.extend(model.metrics.get('group').get('column', {}).get(data_type, [])) %}
 25 |     {% do metrics_to_compute.extend(model.metrics.get('additional').get('column', {}).get(column_name, [])) %} 
 26 | 
 27 |     {% for metric_value in metrics_to_compute %}
 28 |         {% set metric_obj = re_data.extract_metric_config(metric_value) %}
 29 |         {% set expression = re_data.metrics_base_expression_column(model, column_name, metric_obj['metric'], metric_obj['config']) %}
 30 |         {% do col_expr.append({ 'expr': expression, 'col_name': column_name, 'metric': metric_obj['metric']}) %}
 31 |     {% endfor %}
 32 | 
 33 |     {{ return (col_expr) }}
 34 | 
 35 | {% endmacro %}
 36 | 
 37 | 
 38 | {% macro metrics_base_expresion_table_all(model) %}
 39 |     {%- set table_expr = [] %}
 40 |     {%- set metrics_to_compute = [] %}
 41 |     {% do metrics_to_compute.extend(model.metrics.get('group').get('table', [])) %}
 42 |     {% do metrics_to_compute.extend(model.metrics.get('additional').get('table', [])) %}
 43 | 
 44 |     {% for metric_value in metrics_to_compute %}
 45 |         {% set metric_obj = re_data.extract_metric_config(metric_value) %}
 46 |         {% set expression = re_data.metrics_base_expression_table(model, metric_obj['metric'], metric_obj['config']) %}
 47 |         {% do table_expr.append({ 'expr': expression, 'col_name': '', 'metric': metric_obj['metric']}) %}
 48 |     {% endfor %}
 49 | 
 50 |     {{ return (table_expr) }}
 51 | 
 52 | {% endmacro %}
 53 | 
 54 | {% macro metrics_base_expression_table(model, metric_name, config) %}
 55 |     {% set metric_macro = re_data.get_metric_macro(metric_name) %}
 56 |     {% set context = {'time_filter': model.time_filter, 'metric_name': metric_name, 'config': config, 'table_name': model.table_name, 'column_name': none} %}
 57 | 
 58 |     {{ metric_macro(context) }}
 59 | 
 60 | {% endmacro %}
 61 | 
 62 | 
 63 | {%- macro metrics_base_expression_column(model, column_name, metric_name, config) %}
 64 |     {% set metric_macro = re_data.get_metric_macro(metric_name) %}
 65 |     {% set context = {'time_filter': model.time_filter, 'metric_name': metric_name, 'config': config, 'table_name': model.table_name, 'column_name': re_data.quote_column_name(column_name)} %}
 66 | 
 67 |     {{ metric_macro(context) }}
 68 | 
 69 | {% endmacro %}
 70 | 
 71 | {% macro extract_metric_config(metric_value) %}
 72 | 
 73 |     {% set config = none %}
 74 | 
 75 |     {% if metric_value is mapping %}
 76 |         {% set metric = metric_value.keys() | first %}
 77 |         {% if metric_value[metric] is none %}
 78 |             {{ exceptions.raise_compiler_error("Empty configuration passed for metric: " ~ metric ~ ". If the metric doesn't use a config, please use the column name as a string.") }}
 79 |         {% endif %}
 80 | 
 81 |         {% set config = metric_value[metric] %}
 82 |     {%- else %}
 83 |         {% set metric = metric_value %}
 84 |     {% endif %}
 85 | 
 86 |     {{ return ({'metric': metric, 'config': config}) }}
 87 | 
 88 | {% endmacro %}
 89 | 
 90 | {%- macro get_metric_macro(metric_name) %}
 91 |     {% set macro_name = 're_data_metric' + '_' + metric_name %}
 92 | 
 93 |     {% if context['re_data'].get(macro_name) %}
 94 |         {% set metric_macro = context['re_data'][macro_name] %}
 95 |     {%- else %}
 96 |         {% set metric_macro = context[project_name][macro_name] %}
 97 |     {% endif %}
 98 | 
 99 |     {{ return (metric_macro) }}
100 | 
101 | {% endmacro %}
102 | 
103 | 


--------------------------------------------------------------------------------
/macros/public/validating/regex_dict.sql:
--------------------------------------------------------------------------------
 1 | {#
 2 | #  This file contains significant part of code licensed under:
 3 | #  Copyright 2020 Soda
 4 | #  Licensed under the Apache License, Version 2.0 (the "License");
 5 | #  you may not use this file except in compliance with the License.
 6 | #  You may obtain a copy of the License at
 7 | #   http://www.apache.org/licenses/LICENSE-2.0
 8 | #  Unless required by applicable law or agreed to in writing, software
 9 | #  distributed under the License is distributed on an "AS IS" BASIS,
10 | #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 | #  See the License for the specific language governing permissions and
12 | #  limitations under the License.
13 | #}
14 | 
15 | {% macro get_regex_for(to_validate) %}
16 | 
17 |     {% set regexp_dict = {
18 |         'number_whole': '^\-?[0-9]+$',
19 |         'number_decimal_point': '^\-?[0-9]+\.[0-9]+$',
20 |         'number_decimal_comma': '^\-?[0-9]+,[0-9]+$',
21 |         'number_percentage': '^\-?[0-9]+([\.,][0-9]+)? ?%$',
22 |         'number_percentage_point': '^\-?[0-9]+([\.][0-9]+)? ?%$',
23 |         'number_percentage_comma': '^\-?[0-9]+([,][0-9]+)? ?%$',
24 |         'date_eu': '^([1-9]|0[1-9]|[12][0-9]|3[01])[-\./]([1-9]|0[1-9]|1[012])[-\./](19|20)?[0-9][0-9]$',
25 |         'date_us': '^([1-9]|0[1-9]|1[012])[-\./]([1-9]|0[1-9]|[12][0-9]|3[01])[-\./](19|20)?[0-9][0-9]$',
26 |         'date_inverse': '^(19|20)[0-9][0-9][-\./]?([1-9]|0[1-9]|1[012])[-\./]?([1-9]|0[1-9]|[12][0-9]|3[01])$',
27 |         'time_24h': '^([01][0-9]|2[0-3]):([0-5][0-9])$',
28 |         'time_12h': '^(1[0-2]|0?[1-9]):[0-5][0-9]$',
29 |         'time': '^([0-9]|1[0-9]|2[0-4])[:-]([0-9]|[0-5][0-9])([:-]([0-9]|[0-5][0-9])(,[0-9]+)?)?$',
30 |         'date_iso_8601':
31 |             '^'
32 |             '([1-9][0-9]{3}-((0[1-9]|1[0-2])-(0[1-9]|1[0-9]|2[0-8])|(0[13-9]|1[0-2])-(29|30)|(0[13578]|1[02])-31)|'
33 |             '([1-9][0-9](0[48]|[2468][048]|[13579][26])|([2468][048]|[13579][26])00)-02-29)'
34 | 
35 |             'T([01][0-9]|2[0-3]):[0-5][0-9]:[0-5][0-9](\.[0-9]+)?'
36 | 
37 |             '(Z|[+-][01][0-9]:[0-5][0-9])?'
38 |             '$',
39 |         'uuid': '^[0-9a-fA-F]{8}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{4}\-[0-9a-fA-F]{12}$',
40 |         'ipv4_address': '^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$',
41 |         'ipv6_address': '^((([0-9A-Fa-f]{1,4}:){7}([0-9A-Fa-f]{1,4}|:))|(([0-9A-Fa-f]{1,4}:){6}(:[0-9A-Fa-f]{1,4}|((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3})|:))|(([0-9A-Fa-f]{1,4}:){5}(((:[0-9A-Fa-f]{1,4}){1,2})|:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3})|:))|(([0-9A-Fa-f]{1,4}:){4}(((:[0-9A-Fa-f]{1,4}){1,3})|((:[0-9A-Fa-f]{1,4})?:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(([0-9A-Fa-f]{1,4}:){3}(((:[0-9A-Fa-f]{1,4}){1,4})|((:[0-9A-Fa-f]{1,4}){0,2}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(([0-9A-Fa-f]{1,4}:){2}(((:[0-9A-Fa-f]{1,4}){1,5})|((:[0-9A-Fa-f]{1,4}){0,3}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(([0-9A-Fa-f]{1,4}:){1}(((:[0-9A-Fa-f]{1,4}){1,6})|((:[0-9A-Fa-f]{1,4}){0,4}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:))|(:(((:[0-9A-Fa-f]{1,4}){1,7})|((:[0-9A-Fa-f]{1,4}){0,5}:((25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])(\.(25[0-5]|2[0-4][0-9]|1[0-9][0-9]|[1-9]?[0-9])){3}))|:)))(%.+)?$',
42 |         'email': '^[A-Za-z0-9.-_%]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}$',
43 |         } %}
44 | 
45 |     {% set base_regex = regexp_dict[to_validate] %}
46 |     {% set qualifed_regex = adapter.dispatch('get_regex_for', 're_data')(base_regex) %}
47 |     {{ return(qualifed_regex) }}
48 | 
49 | {% endmacro %}
50 | 
51 | {% macro default__get_regex_for(pattern) %}
52 |     {{ return (pattern) }}
53 | {% endmacro %}
54 | 
55 | {% macro redshift__get_regex_for(pattern) %}
56 |     {% set changed = modules.re.sub('\.', '\\.', pattern) %}
57 |     {% set changed = modules.re.sub('\-', '\\-', changed) %}
58 |     {{ return (changed) }}
59 | {% endmacro %}
60 | 
61 | {% macro snowflake__get_regex_for(pattern) %}
62 |     {% set changed = modules.re.sub('\.', '\\.', pattern) %}
63 |     {% set changed = modules.re.sub('\-', '\\-', changed) %}
64 |     {{ return (changed) }}
65 | {% endmacro %}


--------------------------------------------------------------------------------
/macros/run_end/save_results_history.sql:
--------------------------------------------------------------------------------
  1 | 
  2 | {% macro save_test_history(results) -%}
  3 | 
  4 | {{ adapter.dispatch('save_test_history_impl', 're_data') (results) }}
  5 | 
  6 | {%- endmacro %}
  7 | 
  8 | {% macro default__save_test_history_impl(results) %}
  9 |     -- depends_on: {{ ref('re_data_test_history') }}
 10 |     {% set command = flags.WHICH %}
 11 |     {% if execute and results and command in ('test', 'build') %}
 12 |         {% set tests = [] %}
 13 |         {% for el in results %}
 14 |             {% if el.node.resource_type.value == 'test' %}
 15 |                 {% do tests.append(re_data.test_data_dict(el)) %}
 16 |             {% endif %}
 17 |         {% endfor %}
 18 | 
 19 |         {% if tests %}
 20 |             {% do re_data.insert_list_to_table(
 21 |                 ref('re_data_test_history'),
 22 |                 tests,
 23 |                 ['table_name', 'column_name', 'test_name', 'status', 'execution_time', 'message', 'failures_count', 'failures_json', 'failures_table', 'severity', 'compiled_sql', 'run_at'],
 24 |                 { 'run_at': timestamp_type() }
 25 |             ) %}
 26 |         {% endif %}
 27 | 
 28 |     {% endif %}
 29 |     {{ return ('') }}
 30 | 
 31 | {% endmacro %}
 32 | 
 33 | {% macro test_data_dict(el) %}
 34 | 
 35 |     {% set run_started_at_str = run_started_at.strftime('%Y-%m-%d %H:%M:%S') %}
 36 | 
 37 |     {% if el.node.to_dict().get('test_metadata') %}
 38 |         {% set any_refs = modules.re.findall("ref\(\'(?P<name>.*)\'\)", el.node.test_metadata.kwargs['model']) %}
 39 |         {% set any_source = modules.re.findall("source\(\'(?P<one>.*)\'\,\s+\'(?P<two>.*)\'\)", el.node.test_metadata.kwargs['model']) %}
 40 | 
 41 |         {% if any_refs %}
 42 |             {% set name = any_refs[0] %}
 43 |             {% set node_name = re_data.priv_full_name_from_depends(el.node, name) %}
 44 |             {% set schema = graph.nodes.get(node_name)['schema'] %}
 45 |             {% set database = graph.nodes.get(node_name)['database'] %}
 46 |             {% set table_name = (database + '.' + schema + '.' + name) | lower %} 
 47 |             
 48 |         {% elif any_source %}
 49 |             {% set package_name = any_source[0][0] %}
 50 |             {% set name = any_source[0][1] %}
 51 |             {% set node_name = re_data.priv_full_name_from_depends(el.node, name) %}
 52 |             {% set schema = graph.sources.get(node_name)['schema'] %}
 53 |             {% set database = graph.sources.get(node_name)['database'] %}
 54 |             {% set table_name = (database + '.' + schema + '.' + name) | lower %}
 55 |         {% else %}
 56 |             {% set table_name = none %}
 57 |         {% endif %}
 58 |     {% else %}
 59 |         {% set table_name = none %}
 60 |     {% endif %}
 61 | 
 62 |     {% if var.has_var('re_data:query_test_failures') %}
 63 |         {% set query_failures = var('re_data:query_test_failures') %}
 64 |     {% else %}
 65 |         {% set query_failures = true %}
 66 |     {% endif %}
 67 | 
 68 |     {% if el.failures and el.failures > 0 and el.node.relation_name and query_failures %}
 69 |         {% if var.has_var('re_data:test_history_failures_limit') %}
 70 |             {% set limit_count = var('re_data:test_history_failures_limit')%}
 71 |         {% else %}
 72 |             {% set limit_count = 10 %}
 73 |         {% endif %}
 74 | 
 75 |         {% set failures_query %}
 76 |             select * from {{ el.node.relation_name}} limit {{ limit_count }}
 77 |         {% endset %}
 78 |         {% set failures_list = re_data.agate_to_list(run_query(failures_query)) %}
 79 |     {% endif %}
 80 | 
 81 |     {% set failures_json = none %}
 82 | 
 83 |     {{ return ({
 84 |         'table_name': table_name,
 85 |         'column_name': el.node.column_name or none,
 86 |         'test_name': el.node.name,
 87 |         'status': el.status.name,
 88 |         'execution_time': el.execution_time,
 89 |         'message': el.message,
 90 |         'failures_count': el.failures,
 91 |         'failures_json': '' ~ failures_list,
 92 |         'failures_table': el.node.relation_name or none,
 93 |         'severity': el.node.config.severity,
 94 |         'compiled_sql': el.node.compiled_sql or el.node.compiled_code or none,
 95 |         'run_at': run_started_at_str,
 96 |         })
 97 |     }}
 98 | 
 99 | {% endmacro %}
100 | 
101 | {% macro priv_full_name_from_depends(node, name) %}
102 | 
103 |     {% for full_name in node.depends_on.nodes %}
104 |         {% set node_name = full_name.split('.')[-1] %}
105 |         {% if node_name == name %}
106 |             {{ return(full_name) }}
107 |         {% endif %}
108 |     {% endfor %}
109 | 
110 |     {{ return(none) }}
111 | 
112 | {% endmacro %}
113 | 


--------------------------------------------------------------------------------
/macros/public/store/generate_overview.sql:
--------------------------------------------------------------------------------
 1 | 
 2 | {% macro overview_select_base(type, timestamp_col) %}
 3 |     '{{ type }}' as {{ re_data.quote_column('type') }},
 4 |     table_name as {{ re_data.quote_column('table_name') }},
 5 |     column_name as {{ re_data.quote_column('column_name') }},
 6 |     {{ timestamp_col }} as {{ re_data.quote_column('computed_on') }},
 7 | {% endmacro %}
 8 | 
 9 | {% macro generate_overview(start_date, end_date, interval, overview_path=None, monitored_path=None) %}
10 | -- depends_on: {{ ref('re_data_anomalies') }}
11 | -- depends_on: {{ ref('re_data_base_metrics') }}
12 | -- depends_on: {{ ref('re_data_schema_changes') }}
13 | -- depends_on: {{ ref('re_data_columns') }}
14 | 
15 |     {# time grain is either days or hour #}
16 |     {% set time_grain, num_str = interval.split(':') %}
17 |     {% set num = num_str | int %}
18 |     {% if time_grain == 'hours' %}
19 |         {% set interval_length_sec = num * 3600 %}
20 |     {% elif time_grain == 'days'%}
21 |         {% set interval_length_sec = num * 3600 * 24 %}
22 |     {% else %}
23 |         {{ exceptions.raise_compiler_error("Invalid interval. Got: " ~ interval) }}
24 |     {% endif %}
25 |     {{ dbt_utils.log_info('[re_data] interval length in seconds is ' ~ interval_length_sec) }}
26 |     {% set overview_query %}
27 |         with schema_changes_casted as (
28 |             select id, table_name, operation, column_name, data_type, {{ bool_to_string('is_nullable') }}, prev_column_name, prev_data_type, {{ bool_to_string('prev_is_nullable') }}, detected_time
29 |             from {{ ref('re_data_schema_changes') }}
30 |         ),
31 |         columns_casted as (
32 |             select {{ full_table_name('name', 'schema', 'database') }} as table_name, column_name, data_type, {{ bool_to_string('is_nullable') }}, computed_on
33 |             from {{ ref('re_data_columns') }} 
34 |         )
35 |         
36 |     (    
37 |         select
38 |             {{ overview_select_base('metric', 'computed_on')}}
39 |             {{ to_single_json(['metric', 'value', 'time_window_end', 'interval_length_sec']) }} as {{ re_data.quote_column('data') }}
40 |         from
41 |             {{ ref('re_data_base_metrics') }}
42 |             where {{ in_date_window('time_window_end', start_date, end_date) }}
43 |             and interval_length_sec = {{interval_length_sec}}
44 |     ) union all 
45 |     (
46 |         select
47 |             {{ overview_select_base('anomaly', 'computed_on')}}
48 |             {{ to_single_json(['id', 'metric', 'z_score_value', 'last_value', 'last_avg', 'last_stddev', 'time_window_end', 'interval_length_sec']) }} as {{ re_data.quote_column('data') }}
49 |         from
50 |             {{ ref('re_data_anomalies') }}
51 |             where {{ in_date_window('time_window_end', start_date, end_date) }}
52 |             and interval_length_sec = {{interval_length_sec}}
53 |     ) union all
54 |     (
55 |         select
56 |             {{ overview_select_base('schema_change', 'detected_time')}}
57 |             {{ to_single_json(['id', 'operation', 'data_type', 'is_nullable', 'prev_column_name', 'prev_data_type', 'prev_is_nullable', 'detected_time']) }} as {{ re_data.quote_column('data') }}
58 |         from
59 |             schema_changes_casted
60 |             where {{ in_date_window('detected_time', start_date, none) }}
61 |     ) union all
62 |     (
63 |         select
64 |             {{ overview_select_base('schema', 'computed_on')}}
65 |             {{ to_single_json(['data_type', 'is_nullable']) }} as {{ re_data.quote_column('data') }}
66 |         from
67 |             columns_casted
68 |     )
69 |     union all 
70 |     (
71 |         select 
72 |             'alert' as {{ re_data.quote_column('type') }},
73 |             model as {{ re_data.quote_column('table_name') }},
74 |             null as {{ re_data.quote_column('column_name') }},
75 |             time_window_end as {{ re_data.quote_column('computed_on') }},
76 |             {{ to_single_json(['type', 'model', 'message', 'value', 'time_window_end']) }} as {{ re_data.quote_column('data') }}
77 |         from
78 |             {{ ref('re_data_alerts') }}
79 |         where
80 |             case
81 |                 when type = 'anomaly' then {{ in_date_window('time_window_end', start_date, end_date)  }} 
82 |                 else {{ in_date_window('time_window_end', start_date, none) }}
83 |             end
84 |     )
85 |     order by {{ re_data.quote_column('computed_on')}} desc
86 |     {% endset %}
87 | 
88 |     {% set overview_result = run_query(overview_query) %}
89 |     {% set overview_file_path = overview_path or '../target/re_data/overview.json' %}
90 |     {% do overview_result.to_json(overview_file_path) %}
91 |     {{ save_monitored(monitored_path) }}
92 | 
93 | {% endmacro %}
94 | 


--------------------------------------------------------------------------------
/macros/tests/test_metrics.sql:
--------------------------------------------------------------------------------
  1 | 
  2 | {% macro metric_expression(table, metric, expression, column_name=None, condition=None) %}
  3 |     select * from {{ref('re_data_base_metrics')}}
  4 |     where
  5 |         table_name = '{{ re_data.full_table_name_values(table.identifier, table.schema, table.database)}}' and
  6 |         metric = '{{ metric }}' and
  7 |         {% if condition is not none %}
  8 |             {{ condition }} and
  9 |         {% endif %}
 10 |         {% if column_name is none %}
 11 |         not ( {{ expression }} )
 12 |         {% else %}
 13 |         column_name = '{{ column_name }}' and
 14 |         not ( {{ expression }} )
 15 |         {% endif %}
 16 | 
 17 | {% endmacro %}
 18 | 
 19 | {# old test macros, will be removed after some time #}
 20 | {% test metric_expression_is_true(model, table, metric, expression, column_name=None, condition=None) %}
 21 |     {{ re_data.metric_expression(table, metric, expression, column_name=None, condition=None) }}
 22 | {% endtest %}
 23 | 
 24 | 
 25 | {% test metric_equal_to(model, table, metric, value, column_name=None, condition=None) %}
 26 |     {{ re_data.metric_expression(table, metric, 'value = ' ~ value, column_name, condition) }}
 27 | {% endtest %}
 28 | 
 29 | 
 30 | {% test metric_in_range(model, table, metric, min_value, max_value, column_name=None, condition=None) %}
 31 |     {{ re_data.metric_expression(table, metric, 'value >= ' ~ min_value ~ ' and value <= ' ~ max_value, column_name, condition) }}
 32 | {% endtest %}
 33 | 
 34 | {# new test macros #}
 35 | 
 36 | {% test assert_true(model, column_name=None, metric=None, expression=expression, condition=None) %}
 37 |     -- depends_on: {{ ref('re_data_base_metrics') }}
 38 |     {% if execute %}
 39 |         {{ re_data.metric_expression(model, metric, expression, column_name, condition) }}
 40 |     {% else %}
 41 |         {{ re_data.empty_table() }}
 42 |     {% endif %}
 43 | {% endtest %}
 44 | 
 45 | {% test assert_false(model, column_name=None, metric=None, expression=expression, condition=None) %}
 46 |     -- depends_on: {{ ref('re_data_base_metrics') }}
 47 |     {% if execute %}
 48 |         {{ re_data.metric_expression(model, metric, 'not (' ~ expression ~ ')', column_name, condition) }}
 49 |     {% else %}
 50 |         {{ re_data.empty_table() }}
 51 |     {% endif %}
 52 | {% endtest %}
 53 | 
 54 | {% test assert_in_range(model, column_name=None, metric=None, min_value=None, max_value=None, condition=None) %}
 55 |     -- depends_on: {{ ref('re_data_base_metrics') }}
 56 |     {% if execute %}
 57 |         {{ re_data.metric_expression(model, metric, 'value >= ' ~ min_value ~ ' and value <= ' ~ max_value, column_name, condition) }}
 58 |     {% else %}
 59 |         {{ re_data.empty_table() }}
 60 |     {% endif %}
 61 | {% endtest %}
 62 | 
 63 | {% test assert_equal(model, column_name=None, metric=None, value=value, condition=None) %}
 64 |     -- depends_on: {{ ref('re_data_base_metrics') }}
 65 |     {% if execute %}
 66 |         {{ re_data.metric_expression(model, metric, 'value = ' ~ value, column_name, condition) }}
 67 |     {% else %}
 68 |         {{ re_data.empty_table() }}
 69 |     {% endif %}
 70 | {% endtest %}
 71 | 
 72 | {% test assert_greater(model, column_name=None, metric=None, value=None, condition=None) %}
 73 |     -- depends_on: {{ ref('re_data_base_metrics') }}
 74 |     {% if execute %}
 75 |         {{ re_data.metric_expression(model, metric, 'value > ' ~ value, column_name, condition) }}
 76 |     {% else %}
 77 |         {{ re_data.empty_table() }}
 78 |     {% endif %}
 79 | {% endtest %}
 80 | 
 81 | {% test assert_greater_equal(model, column_name=None, metric=None, value=None, condition=None) %}
 82 |     -- depends_on: {{ ref('re_data_base_metrics') }}
 83 |     {% if execute %}
 84 |         {{ re_data.metric_expression(model, metric, 'value >= ' ~ value, column_name, condition) }}
 85 |     {% else %}
 86 |         {{ re_data.empty_table() }}
 87 |     {% endif %}
 88 | {% endtest %}
 89 | 
 90 | {% test assert_less(model, column_name=None, metric=None, value=None, condition=None) %}
 91 |     -- depends_on: {{ ref('re_data_base_metrics') }}
 92 |     {% if execute %}
 93 |         {{ re_data.metric_expression(model, metric, 'value < ' ~ value, column_name, condition) }}
 94 |     {% else %}
 95 |         {{ re_data.empty_table() }}
 96 |     {% endif %}
 97 | {% endtest %}
 98 | 
 99 | {% test assert_less_equal(model, column_name=None, metric=None, value=None, condition=None) %}
100 |     -- depends_on: {{ ref('re_data_base_metrics') }}
101 |     {% if execute %}
102 |         {{ re_data.metric_expression(model, metric, 'value <= ' ~ value, column_name, condition) }}
103 |     {% else %}
104 |         {{ re_data.empty_table() }}
105 |     {% endif %}
106 | {% endtest %}


--------------------------------------------------------------------------------
/.github/workflows/run-db-tests.yml:
--------------------------------------------------------------------------------
  1 | name: Run package tests for all dbs
  2 | 
  3 | on: [push, delete]
  4 | 
  5 | env:
  6 |   DBT_PROFILES_DIR: ${{ github.workspace }}/
  7 |   SNOWFLAKE_RE_DATA_TESTING_ACCOUNT: ${{ secrets.SNOWFLAKE_RE_DATA_TESTING_ACCOUNT }}
  8 |   RE_DATA_TESTING_USER: ${{ secrets.RE_DATA_TESTING_USER }}
  9 |   RE_DATA_TESTING_PASSWORD: ${{ secrets.RE_DATA_TESTING_PASSWORD }}
 10 |   REDSHIFT_RE_DATA_TESTING_HOST: ${{ secrets.REDSHIFT_RE_DATA_TESTING_HOST }}
 11 |   BIGQUERY_TESTING_TYPE: ${{ secrets.BIGQUERY_TESTING_TYPE }}
 12 |   BIGQUERY_TESTING_PROJECT_ID: ${{ secrets.BIGQUERY_TESTING_PROJECT_ID }}
 13 |   BIGQUERY_TESTING_PRIVATE_KEY_ID: ${{ secrets.BIGQUERY_TESTING_PRIVATE_KEY_ID }}
 14 |   BIGQUERY_TESTING_PRIVATE_KEY: ${{ secrets.BIGQUERY_TESTING_PRIVATE_KEY }}
 15 |   BIGQUERY_TESTING_CLIENT_EMAIL: ${{ secrets.BIGQUERY_TESTING_CLIENT_EMAIL }}
 16 |   BIGQUERY_TESTING_CLIENT_ID: ${{ secrets.BIGQUERY_TESTING_CLIENT_ID }}
 17 |   BIGQUERY_TESTING_AUTH_URI: ${{ secrets.BIGQUERY_TESTING_AUTH_URI }}
 18 |   BIGQUERY_TESTING_TOKEN_URI: ${{ secrets.BIGQUERY_TESTING_TOKEN_URI }}
 19 |   BIGQUERY_TESTING_AUTH_PROVIDER_X509_CERT_URL: ${{ secrets.BIGQUERY_TESTING_AUTH_PROVIDER_X509_CERT_URL }}
 20 |   BIGQUERY_TESTING_CLIENT_X509_CERT_URL: ${{ secrets.BIGQUERY_TESTING_CLIENT_X509_CERT_URL }}
 21 |   DBT_VERSION: 1.7
 22 |   PYTHON_VERSION: "3.8.x"
 23 | 
 24 | jobs:
 25 |   test-postgres:
 26 |     runs-on: ubuntu-latest
 27 |     if: github.event_name == 'push'
 28 |     services:
 29 |       postgres:
 30 |         image: postgres
 31 |         env:
 32 |           POSTGRES_PASSWORD: postgres
 33 |         # Set health checks to wait until postgres has started
 34 |         options: >-
 35 |           --health-cmd pg_isready
 36 |           --health-interval 10s
 37 |           --health-timeout 5s
 38 |           --health-retries 5
 39 |         ports:
 40 |           # Maps tcp port 5432 on service container to the host
 41 |           - 5432:5432
 42 |     steps:        
 43 |       - name: Check out
 44 |         uses: actions/checkout@v2
 45 | 
 46 |       - uses: actions/setup-python@v4
 47 |         with:
 48 |           python-version: ${{ env.PYTHON_VERSION }}
 49 | 
 50 |       - name: Install dependencies
 51 |         working-directory: ./integration_tests
 52 |         run: |
 53 |           pip install -r requirements.txt
 54 |           pip install dbt-postgres==$DBT_VERSION
 55 |           dbt deps
 56 | 
 57 |       - name: Test DB
 58 |         working-directory: ./integration_tests/python_tests
 59 |         run: pytest --db postgres --source_schema dq
 60 | 
 61 |   test-other-dbs:
 62 |     runs-on: ubuntu-latest
 63 |     if: github.event_name == 'push' && github.repository == 're-data/dbt-re-data' && github.ref == 'refs/heads/main'
 64 |     strategy:
 65 |       fail-fast: false
 66 |       matrix:
 67 |         database: [snowflake, bigquery, redshift]
 68 |     steps:        
 69 |       - name: Check out
 70 |         uses: actions/checkout@v2
 71 | 
 72 |       - uses: actions/setup-python@v4
 73 |         with:
 74 |           python-version: ${{ env.PYTHON_VERSION }}
 75 | 
 76 |       - name: Inject slug/short variables
 77 |         uses: rlespinasse/github-slug-action@v3.x
 78 | 
 79 |       - name: Set the DQ_SCHEMA environment variable
 80 |         shell: bash
 81 |         run: |
 82 |           echo "DQ_SCHEMA=dq_${GITHUB_REF_SLUG//[^[:alnum:]]/_}" >> $GITHUB_ENV
 83 | 
 84 |       - name: Print DQ_SCHEMA
 85 |         run: |
 86 |           echo $DQ_SCHEMA
 87 | 
 88 |       - name: Install dependencies
 89 |         working-directory: ./integration_tests
 90 |         run: |
 91 |           pip install -r requirements.txt
 92 |           pip install dbt-${{ matrix.database }}==$DBT_VERSION
 93 |           dbt deps
 94 | 
 95 |       - name: Drop schemas
 96 |         working-directory: ./integration_tests
 97 |         run: |
 98 |           dbt run-operation drop_all_schemas --args "{ schema_name: ${{ env.DQ_SCHEMA }} }" --profile re_data_${{ matrix.database }} --vars "{ source_schema: ${{ env.DQ_SCHEMA }} }"
 99 | 
100 |       - name: Create Schemas if needed
101 |         if: matrix.database == 'redshift'
102 |         working-directory: ./integration_tests
103 |         run: |
104 |           dbt run-operation create_required_schemas --args "{ schema_name: ${{ env.DQ_SCHEMA }} }" --profile re_data_${{ matrix.database }} --vars "{ source_schema: ${{ env.DQ_SCHEMA }} }"
105 | 
106 |       - name: Test DB
107 |         working-directory: ./integration_tests/python_tests
108 |         run: |
109 |           pytest --db ${{ matrix.database }} --source_schema ${{ env.DQ_SCHEMA }}
110 | 
111 |   clean-up-schemas:
112 |     runs-on: ubuntu-latest
113 |     if: github.event_name == 'delete' && github.repository == 're-data/dbt-re-data' && github.ref == 'refs/heads/main'
114 |     strategy:
115 |       fail-fast: false
116 |       matrix:
117 |         database: [snowflake, bigquery, redshift]
118 |     steps:        
119 |       - name: Check out
120 |         uses: actions/checkout@v2
121 | 
122 |       - uses: actions/setup-python@v4
123 |         with:
124 |           python-version: ${{ env.PYTHON_VERSION }}
125 | 
126 |       - name: Inject slug/short variables
127 |         uses: rlespinasse/github-slug-action@v3.x
128 | 
129 |       - name: Set the DQ_SCHEMA environment variable
130 |         shell: bash
131 |         run: |
132 |           echo "DQ_SCHEMA=dq_${GITHUB_EVENT_REF_SLUG//[^[:alnum:]]/_}" >> $GITHUB_ENV
133 | 
134 |       - name: Print DQ_SCHEMA
135 |         run: |
136 |           echo $DQ_SCHEMA
137 | 
138 |       - name: Install dependencies and drop branch schema
139 |         working-directory: ./integration_tests
140 |         run: |
141 |           pip install -r requirements.txt
142 |           pip install dbt-${{ matrix.database }}==$DBT_VERSION
143 |           dbt deps
144 |           dbt run-operation drop_all_schemas --args "{ schema_name: ${{ env.DQ_SCHEMA }} }" --profile re_data_${{ matrix.database }} --vars "{ source_schema: ${{ env.DQ_SCHEMA }} }"


--------------------------------------------------------------------------------
/models/alerts/re_data_schema_changes.sql:
--------------------------------------------------------------------------------
  1 | {{
  2 |     config(
  3 |         materialized='incremental',
  4 |         unique_key = 'id',
  5 |         on_schema_change='sync_all_columns',
  6 |     )
  7 | }}
  8 | 
  9 | -- depends_on: {{ ref('re_data_run_started_at') }}
 10 | -- depends_on: {{ ref('re_data_columns_over_time') }}
 11 | -- depends_on: {{ ref('re_data_monitored') }}
 12 | -- depends_on: {{ ref('re_data_selected') }}
 13 | 
 14 | {% if execute and not re_data.in_compile() %}
 15 |     {% set last_data_points %} 
 16 |         select
 17 |             distinct detected_time
 18 |         from {{ ref('re_data_columns_over_time') }}
 19 |         order by
 20 |         detected_time desc limit 2;
 21 |     {% endset %}
 22 | 
 23 |     {% set detected_times = run_query(last_data_points) %}
 24 | 
 25 |     {% set times_list = detected_times.columns[0].values() %}
 26 |     {% set most_recent_time = times_list[0] %}
 27 | 
 28 |     {% if times_list | length > 1 %}
 29 |         {% set prev_most_recent = times_list[1] %}
 30 |     {% else %}
 31 |         {% set prev_most_recent = times_list[0] %}
 32 |     {% endif %}
 33 | {% else %}
 34 |     {% set times_list = () %}
 35 | {% endif %}
 36 | 
 37 | {% if times_list == () %}
 38 |     {{
 39 |         re_data.empty_table_generic([
 40 |             ('id', 'string'),
 41 |             ('table_name', 'string'),
 42 |             ('operation', 'string'),
 43 |             ('column_name', 'string'),
 44 |             ('data_type', 'string'),
 45 |             ('is_nullable', 'boolean'),
 46 |             ('prev_column_name', 'string'),
 47 |             ('prev_data_type', 'string'),
 48 |             ('prev_is_nullable', 'boolean'),
 49 |             ('detected_time', 'timestamp')
 50 |             ])
 51 |     }}
 52 | {% else %}
 53 | 
 54 |     with curr_monitored_schema as (
 55 |         select * from {{ ref('re_data_columns_over_time')}}
 56 |         where detected_time = cast('{{ most_recent_time }}' as {{ timestamp_type() }})
 57 |         and table_name in (
 58 |             select {{ full_table_name('name', 'schema', 'database') }} from {{ ref('re_data_selected')}}
 59 |         )
 60 |     ),
 61 | 
 62 | 
 63 |     prev_monitored_schema as (
 64 |         select * from {{ ref('re_data_columns_over_time')}}
 65 |         where detected_time = cast('{{ prev_most_recent}}' as {{ timestamp_type() }})
 66 |         and table_name in (
 67 |             select {{ full_table_name('name', 'schema', 'database') }} from {{ ref('re_data_selected')}}
 68 |         )
 69 |     ),
 70 | 
 71 |     all_changes as (
 72 |             (
 73 |             select
 74 |                 curr.table_name as table_name,
 75 |                 'type_change' as operation,
 76 |                 curr.column_name as column_name,
 77 |                 curr.data_type as data_type,
 78 |                 curr.is_nullable as is_nullable,
 79 | 
 80 |                 prev.column_name as prev_column_name,
 81 |                 prev.data_type as prev_data_type,
 82 |                 prev.is_nullable as prev_is_nullable
 83 |             
 84 |             from curr_monitored_schema curr inner join prev_monitored_schema prev on (curr.table_name = prev.table_name and curr.column_name = prev.column_name)
 85 |             where
 86 |                 curr.data_type != prev.data_type or 
 87 |                 curr.is_nullable != prev.is_nullable
 88 |             )
 89 | 
 90 |         union all
 91 | 
 92 |         (
 93 | 
 94 |             select
 95 |                 curr.table_name as table_name,
 96 |                 'column_added' as operation,
 97 |                 curr.column_name as column_name,
 98 |                 curr.data_type as data_type,
 99 |                 curr.is_nullable as is_nullable,
100 | 
101 |                 null as prev_column_name,
102 |                 null as prev_data_type,
103 |                 null as prev_is_nullable
104 |             
105 |             from curr_monitored_schema curr left join prev_monitored_schema prev on (curr.table_name = prev.table_name and curr.column_name = prev.column_name)
106 |             where prev.table_name is null and prev.column_name is null
107 |             {# note: when a column is added, make sure we only detect for models that were previously monitored,
108 |             this avoids a situation where a newly monitored model has all its columns detected with 'column_added' operation#}
109 |             and curr.table_name in (
110 |                 select table_name from prev_monitored_schema
111 |             )
112 |         
113 |         )
114 | 
115 |         union all
116 | 
117 |         (
118 | 
119 |             select
120 |                 prev.table_name as table_name,
121 |                 'column_removed' as operation,
122 |                 null as column_name,
123 |                 null as data_type,
124 |                 null as is_nullable,
125 | 
126 |                 prev.column_name as prev_column_name,
127 |                 prev.data_type as prev_data_type,
128 |                 prev.is_nullable as prev_is_nullable
129 |             
130 |             from prev_monitored_schema prev left join curr_monitored_schema curr on (curr.table_name = prev.table_name and curr.column_name = prev.column_name)
131 |             where curr.table_name is null and curr.column_name is null
132 | 
133 |         )
134 |     ),
135 | 
136 |     all_with_time as (
137 |         select
138 |             all_changes.table_name,
139 |             all_changes.operation,
140 |             all_changes.column_name,
141 |             all_changes.data_type,
142 |             all_changes.is_nullable,
143 |             all_changes.prev_column_name,
144 |             all_changes.prev_data_type,
145 |             all_changes.prev_is_nullable,
146 |             cast({{dbt.current_timestamp_backcompat()}} as {{ timestamp_type() }}) as detected_time
147 |         from all_changes
148 |     )
149 | 
150 |     select 
151 |         cast ({{ dbt_utils.generate_surrogate_key([
152 |         'table_name',
153 |         'column_name',
154 |         'detected_time'
155 |         ]) }} as {{ string_type() }} ) as id,
156 |         table_name,
157 |         cast (operation as {{ string_type() }}) as operation,
158 |         column_name,
159 |         data_type,
160 |         is_nullable,
161 |         prev_column_name,
162 |         prev_data_type,
163 |         prev_is_nullable,
164 |         detected_time
165 |     from all_with_time
166 |     
167 | {% endif %}
168 | 


--------------------------------------------------------------------------------
/integration_tests/seeds/monitoring/expected_z_score.csv:
--------------------------------------------------------------------------------
  1 | table_name,column_name,metric,time_window_end,z_score_value,modified_z_score_value,last_value,last_avg,last_stddev,last_median,last_iqr,last_median_absolute_deviation,last_mean_absolute_deviation,interval_length_sec
  2 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__avg_length,2021-05-03 00:00:00,0,0,14105,14105,0,14105,0,0,0,86400
  3 | SAMPLE_TABLE,EVENT_TYPE,distinct_values,2021-05-03 00:00:00,-707,-674,1000,1500,707,1500,500,500,500,86400
  4 | BUY_EVENTS,EVENT_TYPE,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
  5 | SAMPLE_TABLE,EVENT_TYPE,max_length,2021-05-03 00:00:00,-707,-674,3000,3500,707,3500,500,500,500,86400
  6 | SAMPLE_TABLE,EVENT_TYPE,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
  7 | SAMPLE_TABLE,EVENT_TYPE,min_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400
  8 | SAMPLE_TABLE,EVENT_TYPE,missing_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
  9 | SAMPLE_TABLE,NULL_VALUE,nulls_percent,2021-05-03 00:00:00,0,0,100000,100000,0,100000,0,0,0,86400
 10 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__avg_length,2021-05-03 00:00:00,0,0,3053,3053,0,3053,0,0,0,86400
 11 | BUY_EVENTS,VALUE1,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 12 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 13 | SAMPLE_TABLE,---,freshness,2021-05-03 00:00:00,0,0,40765000,40765000,0,40765000,0,0,0,86400
 14 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__stddev,2021-05-03 00:00:00,0,0,3028,3028,0,3028,0,0,0,86400
 15 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__missing_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 16 | BUY_EVENTS,EVENT_TYPE,min_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400
 17 | SAMPLE_TABLE,VALUE1,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 18 | BUY_EVENTS,VALUE2,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 19 | SAMPLE_TABLE,VALUE1,variance,2021-05-03 00:00:00,707,674,2491667,2425000,94281,2425000,66667,66667,66667,86400
 20 | RE_DATA_SOURCE_TEST_TABLE,---,global__row_count,2021-05-03 00:00:00,0,0,10000,10000,0,10000,0,0,0,86400
 21 | BUY_EVENTS,---,my_distinct_table_rows,2021-05-03 00:00:00,0,0,10000,10000,0,10000,0,0,0,86400
 22 | SAMPLE_WITHOUT_TIME_FILTER,---,global__row_count,2021-05-03 00:00:00,0,0,19000,19000,0,19000,0,0,0,86400
 23 | SAMPLE_TABLE,EVENT_TYPE,match_regex,2021-05-03 00:00:00,-707,-674,0,500,707,500,500,500,500,86400
 24 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__diff,2021-05-03 00:00:00,0,0,4000,4000,0,4000,0,0,0,86400
 25 | BUY_EVENTS,VALUE2,diff,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 26 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__avg,2021-05-03 00:00:00,0,0,5500,5500,0,5500,0,0,0,86400
 27 | SAMPLE_TABLE,VALUE2,min,2021-05-03 00:00:00,0,0,109000,109000,0,109000,0,0,0,86400
 28 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__max_length,2021-05-03 00:00:00,0,0,17000,17000,0,17000,0,0,0,86400
 29 | SAMPLE_WITH_ANOMALY,VALUE1,min,2021-05-03 00:00:00,-707,-674,98000,102500,6364,102500,4500,4500,4500,86400
 30 | SAMPLE_WITH_ANOMALY,VALUE2,max,2021-05-03 00:00:00,707,674,205000,202500,3536,202500,2500,2500,2500,86400
 31 | SAMPLE_WITH_ANOMALY,VALUE2,diff,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 32 | SAMPLE_TABLE,EVENT_TYPE,not_match_regex_percent,2021-05-03 00:00:00,-707,-674,0,12500,17678,12500,12500,12500,12500,86400
 33 | SAMPLE_TABLE,---,distinct_table_rows,2021-05-03 00:00:00,0,0,4000,4000,0,4000,0,0,0,86400
 34 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__missing_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 35 | SAMPLE_WITH_ANOMALY,VALUE2,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 36 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 37 | SAMPLE_TABLE,VALUE2,stddev,2021-05-03 00:00:00,0,0,47975,47975,0,47975,0,0,0,86400
 38 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__min,2021-05-03 00:00:00,0,0,990,990,0,990,0,0,0,86400
 39 | BUY_EVENTS,VALUE2,max,2021-05-03 00:00:00,707,674,205000,202500,3536,202500,2500,2500,2500,86400
 40 | SAMPLE_WITHOUT_TIME_FILTER,---,global__my_distinct_table_rows,2021-05-03 00:00:00,0,0,19000,19000,0,19000,0,0,0,86400
 41 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__avg_length,2021-05-03 00:00:00,0,0,3900,3900,0,3900,0,0,0,86400
 42 | SAMPLE_WITH_ANOMALY,VALUE1,avg,2021-05-03 00:00:00,-707,-674,98000,102500,6364,102500,4500,4500,4500,86400
 43 | SAMPLE_TABLE,VALUE2,variance,2021-05-03 00:00:00,0,0,2301583,2301583,0,2301583,0,0,0,86400
 44 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__missing_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 45 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,missing_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 46 | BUY_EVENTS,VALUE1,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 47 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__max,2021-05-03 00:00:00,0,0,4990,4990,0,4990,0,0,0,86400
 48 | SAMPLE_WITH_ANOMALY,---,my_distinct_table_rows,2021-05-03 00:00:00,0,0,12000,12000,0,12000,0,0,0,86400
 49 | BUY_EVENTS,VALUE1,diff,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 50 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__variance,2021-05-03 00:00:00,0,0,2784,2784,0,2784,0,0,0,86400
 51 | SAMPLE_TABLE,EVENT_TYPE,duplicate_values,2021-05-03 00:00:00,0,0,1000,1000,0,1000,0,0,0,86400
 52 | SAMPLE_TABLE,NULL_VALUE,nulls_count,2021-05-03 00:00:00,0,0,4000,4000,0,4000,0,0,0,86400
 53 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 54 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 55 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__variance,2021-05-03 00:00:00,0,0,9167,9167,0,9167,0,0,0,86400
 56 | SAMPLE_TABLE,EVENT_TYPE,avg_length,2021-05-03 00:00:00,-707,-674,3000,3125,177,3125,125,125,125,86400
 57 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__min,2021-05-03 00:00:00,0,0,1000,1000,0,1000,0,0,0,86400
 58 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__missing_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 59 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__avg,2021-05-03 00:00:00,0,0,3306,3306,0,3306,0,0,0,86400
 60 | BUY_EVENTS,VALUE1,avg,2021-05-03 00:00:00,-707,-674,98000,102500,6364,102500,4500,4500,4500,86400
 61 | SAMPLE_WITH_ANOMALY,VALUE2,min,2021-05-03 00:00:00,707,674,205000,202500,3536,202500,2500,2500,2500,86400
 62 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 63 | SAMPLE_TABLE,EVENT_TYPE,not_match_regex,2021-05-03 00:00:00,-707,-674,0,500,707,500,500,500,500,86400
 64 | BUY_EVENTS,VALUE1,max,2021-05-03 00:00:00,-707,-674,98000,102500,6364,102500,4500,4500,4500,86400
 65 | SAMPLE_TABLE,VALUE1,stddev,2021-05-03 00:00:00,707,674,49917,49240,957,49240,677,677,677,86400
 66 | BUY_EVENTS,VALUE1,min,2021-05-03 00:00:00,-707,-674,98000,102500,6364,102500,4500,4500,4500,86400
 67 | SAMPLE_WITH_ANOMALY,VALUE2,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 68 | SAMPLE_TABLE,---,my_custom_table_metric,2021-05-03 00:00:00,0,0,1000000,1000000,0,1000000,0,0,0,86400
 69 | BUY_EVENTS,EVENT_TYPE,max_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400
 70 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,avg_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400
 71 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 72 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 73 | SAMPLE_WITH_ANOMALY,VALUE1,max,2021-05-03 00:00:00,-707,-674,98000,102500,6364,102500,4500,4500,4500,86400
 74 | SAMPLE_TABLE,---,my_distinct_table_rows,2021-05-03 00:00:00,0,0,9000,9000,0,9000,0,0,0,86400
 75 | SAMPLE_TABLE,---,row_count,2021-05-03 00:00:00,0,0,4000,4000,0,4000,0,0,0,86400
 76 | SAMPLE_TABLE,VALUE2,max,2021-05-03 00:00:00,0,0,209000,209000,0,209000,0,0,0,86400
 77 | SAMPLE_WITH_ANOMALY,VALUE1,diff,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 78 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 79 | BUY_EVENTS,EVENT_TYPE,avg_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400
 80 | SAMPLE_TABLE,VALUE1,diff,2021-05-03 00:00:00,707,674,110000,105000,7071,105000,5000,5000,5000,86400
 81 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__missing_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 82 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__missing_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 83 | SAMPLE_TABLE,VALUE1,min,2021-05-03 00:00:00,0,0,100000,100000,0,100000,0,0,0,86400
 84 | SAMPLE_TABLE,EVENT_TYPE,regex_test,2021-05-03 00:00:00,0,0,4000,4000,0,4000,0,0,0,86400
 85 | SAMPLE_TABLE,EVENT_TYPE,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 86 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__min_length,2021-05-03 00:00:00,0,0,1000,1000,0,1000,0,0,0,86400
 87 | SAMPLE_WITHOUT_TIME_FILTER,TITLE,global__min_length,2021-05-03 00:00:00,0,0,10000,10000,0,10000,0,0,0,86400
 88 | SAMPLE_WITH_ANOMALY,VALUE1,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 89 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__max,2021-05-03 00:00:00,0,0,10000,10000,0,10000,0,0,0,86400
 90 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,missing_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 91 | SAMPLE_TABLE,VALUE2,nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
 92 | SAMPLE_WITH_ANOMALY,VALUE2,avg,2021-05-03 00:00:00,707,674,205000,202500,3536,202500,2500,2500,2500,86400
 93 | BUY_EVENTS,---,freshness,2021-05-03 00:00:00,-707,-674,41065000,41186500,171827,41186500,121500,121500,121500,86400
 94 | BUY_EVENTS,VALUE2,avg,2021-05-03 00:00:00,707,674,205000,202500,3536,202500,2500,2500,2500,86400
 95 | BUY_EVENTS,---,row_count,2021-05-03 00:00:00,0,0,1000,1000,0,1000,0,0,0,86400
 96 | SAMPLE_WITH_ANOMALY,---,freshness,2021-05-03 00:00:00,-707,-674,41065000,41186500,171827,41186500,121500,121500,121500,86400
 97 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__max_length,2021-05-03 00:00:00,0,0,5000,5000,0,5000,0,0,0,86400
 98 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,min_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400
 99 | SAMPLE_TABLE,EVENT_TYPE,match_regex_percent,2021-05-03 00:00:00,-707,-674,0,12500,17678,12500,12500,12500,12500,86400
100 | SAMPLE_WITH_ANOMALY,EVENT_TYPE,max_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400
101 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__stddev,2021-05-03 00:00:00,0,0,1668,1668,0,1668,0,0,0,86400
102 | SAMPLE_TABLE,VALUE1,max,2021-05-03 00:00:00,707,674,210000,205000,7071,205000,5000,5000,5000,86400
103 | SAMPLE_TABLE,VALUE2,avg,2021-05-03 00:00:00,0,0,180750,180750,0,180750,0,0,0,86400
104 | SAMPLE_WITH_ANOMALY,VALUE1,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
105 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__min_length,2021-05-03 00:00:00,0,0,3000,3000,0,3000,0,0,0,86400
106 | SAMPLE_TABLE,VALUE1,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
107 | BUY_EVENTS,EVENT_TYPE,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
108 | BUY_EVENTS,EVENT_TYPE,missing_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
109 | BUY_EVENTS,EVENT_TYPE,missing_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
110 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
111 | BUY_EVENTS,VALUE2,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
112 | RE_DATA_SOURCE_TEST_TABLE,NUMBER,global__diff,2021-05-03 00:00:00,0,0,9000,9000,0,9000,0,0,0,86400
113 | SAMPLE_WITHOUT_TIME_FILTER,RATING,global__nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
114 | BUY_EVENTS,VALUE2,min,2021-05-03 00:00:00,707,674,205000,202500,3536,202500,2500,2500,2500,86400
115 | RE_DATA_SOURCE_TEST_TABLE,DESCRIPTION,global__max_length,2021-05-03 00:00:00,0,0,5000,5000,0,5000,0,0,0,86400
116 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
117 | SAMPLE_TABLE,VALUE2,nulls_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
118 | SAMPLE_TABLE,EVENT_TYPE,duplicate_rows,2021-05-03 00:00:00,707,674,4000,3500,707,3500,500,500,500,86400
119 | RE_DATA_SOURCE_TEST_TABLE,---,global__my_distinct_table_rows,2021-05-03 00:00:00,0,0,10000,10000,0,10000,0,0,0,86400
120 | SAMPLE_TABLE,VALUE2,diff,2021-05-03 00:00:00,0,0,100000,100000,0,100000,0,0,0,86400
121 | SAMPLE_WITHOUT_TIME_FILTER,RENTAL_RATE,global__nulls_percent,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
122 | SAMPLE_WITH_ANOMALY,---,row_count,2021-05-03 00:00:00,0,0,1000,1000,0,1000,0,0,0,86400
123 | SAMPLE_TABLE,EVENT_TYPE,unique_rows,2021-05-03 00:00:00,-707,-674,0,500,707,500,500,500,500,86400
124 | SAMPLE_TABLE,VALUE1,avg,2021-05-03 00:00:00,707,674,142500,135000,10607,135000,7500,7500,7500,86400
125 | SAMPLE_TABLE,EVENT_TYPE,missing_count,2021-05-03 00:00:00,0,0,0,0,0,0,0,0,0,86400
126 | 


--------------------------------------------------------------------------------