├── analyses
    └── .gitkeep
├── macros
    ├── .gitkeep
    ├── exceptions
    │   ├── .gitkeep
    │   └── adapter_missing_exception.sql
    ├── utils
    │   ├── set_except.sql
    │   ├── set_union.sql
    │   ├── cast_timestamp.sql
    │   ├── array_construct.sql
    │   ├── array_contains.sql
    │   ├── array_append.sql
    │   ├── array_agg.sql
    │   └── sql_values.sql
    ├── enforce_graph_structure.sql
    ├── largest_connected_subgraphs.sql
    └── connect_ordered_graph.sql
├── seeds
    └── .gitkeep
├── snapshots
    └── .gitkeep
├── integration_tests
    ├── db
    │   └── .gitkeep
    ├── requirements
    │   ├── requirements_postgres.txt
    │   ├── requirements_duckdb.txt
    │   └── requirements_lint.txt
    ├── bin
    │   ├── start-postgres
    │   ├── stop-postgres
    │   ├── destroy-postgres
    │   └── setup-postgres
    ├── .sqlfluffignore
    ├── data
    │   ├── test_largest_connected_subgraphs
    │   │   ├── test_largest_connected_subgraphs_nd_data.csv
    │   │   ├── test_largest_connected_subgraphs_gid_nd_data.csv
    │   │   ├── test_largest_connected_subgraphs_2_sg_ne_data.csv
    │   │   ├── test_largest_connected_subgraphs_3_sg_ne_data.csv
    │   │   ├── test_largest_connected_subgraphs_4_sg_data.csv
    │   │   ├── test_largest_connected_subgraphs_gid_data.csv
    │   │   ├── test_largest_connected_subgraphs_1_sg_data.csv
    │   │   └── test_largest_connected_subgraphs_gid_3_sg_data.csv
    │   ├── test_graph_is_connected
    │   │   ├── test_graph_is_connected_data.csv
    │   │   └── test_graph_is_connected_gid_data.csv
    │   └── test_connect_ordered_graph
    │   │   ├── test_connect_ordered_graph_2_sg_numeric_data.csv
    │   │   ├── test_connect_ordered_graph_2_sg_numeric_tangled_data.csv
    │   │   ├── test_connect_ordered_graph_2_sg_date_data.csv
    │   │   ├── test_connect_ordered_graph_3_sg_date_data.csv
    │   │   └── test_connect_ordered_graph_4_sg_timestamp_data.csv
    ├── models
    │   ├── test_graph_is_connected
    │   │   ├── test_graph_is_connected.sql
    │   │   ├── test_graph_is_connected_gid.sql
    │   │   └── test_graph_is_connected.yml
    │   ├── README.md
    │   ├── test_largest_connected_subgraphs
    │   │   ├── test_largest_connected_subgraphs_nd.sql
    │   │   ├── test_largest_connected_subgraphs_3_sg_ne.sql
    │   │   ├── test_largest_connected_subgraphs_1_sg.sql
    │   │   ├── test_largest_connected_subgraphs_gid_nd.sql
    │   │   ├── test_largest_connected_subgraphs_2_sg_ne.sql
    │   │   ├── test_largest_connected_subgraphs_4_sg.sql
    │   │   ├── test_largest_connected_subgraphs_gid.sql
    │   │   ├── test_largest_connected_subgraphs_gid_3_sg.sql
    │   │   └── test_largest_connected_subgraphs.yml
    │   └── test_connect_ordered_graph
    │   │   ├── test_connect_ordered_graph.yml
    │   │   ├── test_connect_ordered_graph_2_sg_numeric.sql
    │   │   ├── test_connect_ordered_graph_2_sg_date.sql
    │   │   ├── test_connect_ordered_graph_2_sg_numeric_tangled.sql
    │   │   ├── test_connect_ordered_graph_3_sg_date.sql
    │   │   └── test_connect_ordered_graph_4_sg_timestamp.sql
    ├── packages.yml
    ├── dbt_project.yml
    ├── .sqlfluff
    ├── ci_profiles
    │   └── profiles.yml
    └── macros
    │   └── cte_difference.sql
├── packages.yml
├── .gitignore
├── dbt_project.yml
├── tests
    └── generic
    │   └── graph_is_connected.sql
├── .github
    └── workflows
    │   └── ci.yml
├── LICENSE
└── README.md


/analyses/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/macros/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/seeds/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/snapshots/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/macros/exceptions/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/integration_tests/db/.gitkeep:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/packages.yml:
--------------------------------------------------------------------------------
1 | packages: []
2 | 


--------------------------------------------------------------------------------
/integration_tests/requirements/requirements_postgres.txt:
--------------------------------------------------------------------------------
1 | dbt-postgres>=1.3
2 | 


--------------------------------------------------------------------------------
/integration_tests/bin/start-postgres:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pg_ctl start -D db/config;
3 | 


--------------------------------------------------------------------------------
/integration_tests/bin/stop-postgres:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pg_ctl stop -D db/config;
3 | 


--------------------------------------------------------------------------------
/integration_tests/.sqlfluffignore:
--------------------------------------------------------------------------------
1 | dbt_modules/
2 | dbt_packages/
3 | ci_venv/
4 | target/
5 | 


--------------------------------------------------------------------------------
/integration_tests/requirements/requirements_duckdb.txt:
--------------------------------------------------------------------------------
1 | dbt-duckdb>=1.5,<1.6
2 | duckdb >=0.8.1.dev0
3 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_largest_connected_subgraphs/test_largest_connected_subgraphs_nd_data.csv:
--------------------------------------------------------------------------------
1 | id,vertex_1,vertex_2
2 | 


--------------------------------------------------------------------------------
/integration_tests/requirements/requirements_lint.txt:
--------------------------------------------------------------------------------
1 | dbt-duckdb>=1.5,<1.6
2 | sqlfluff>=2.1
3 | sqlfluff-templater-dbt>=2.1
4 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_largest_connected_subgraphs/test_largest_connected_subgraphs_gid_nd_data.csv:
--------------------------------------------------------------------------------
1 | graph_id,id,vertex_1,vertex_2
2 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_graph_is_connected/test_graph_is_connected.sql:
--------------------------------------------------------------------------------
1 | select * from {{ ref('test_graph_is_connected_data') }}
2 | 


--------------------------------------------------------------------------------
/integration_tests/bin/destroy-postgres:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pg_ctl stop -D db/config > /dev/null 2>&1;
3 | rm -rf db/config > /dev/null 2>&1;
4 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_largest_connected_subgraphs/test_largest_connected_subgraphs_2_sg_ne_data.csv:
--------------------------------------------------------------------------------
1 | id,vertex_1,vertex_2
2 | 1,A,
3 | 2,B,
4 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_graph_is_connected/test_graph_is_connected_gid.sql:
--------------------------------------------------------------------------------
1 | select * from {{ ref('test_graph_is_connected_gid_data') }}
2 | 


--------------------------------------------------------------------------------
/integration_tests/packages.yml:
--------------------------------------------------------------------------------
1 | packages:
2 |   - local: ../
3 |   - package: calogica/dbt_expectations
4 |     version: [">=0.5.0", "<0.6.0"]
5 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_graph_is_connected/test_graph_is_connected_data.csv:
--------------------------------------------------------------------------------
1 | id,vertex_1,vertex_2
2 | 1,A,B
3 | 2,B,C
4 | 3,C,D
5 | 4,B,D
6 | 5,C,A
7 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_largest_connected_subgraphs/test_largest_connected_subgraphs_3_sg_ne_data.csv:
--------------------------------------------------------------------------------
1 | id,vertex_1,vertex_2
2 | 1,A,
3 | 2,B,
4 | 3,,C
5 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_largest_connected_subgraphs/test_largest_connected_subgraphs_4_sg_data.csv:
--------------------------------------------------------------------------------
1 | id,vertex_1,vertex_2
2 | 1,A,B
3 | 2,B,C
4 | 3,C,D
5 | 4,E,
6 | 5,E,F
7 | 6,G,
8 | 7,H,I
9 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_connect_ordered_graph/test_connect_ordered_graph_2_sg_numeric_data.csv:
--------------------------------------------------------------------------------
1 | id,vertex_1,vertex_2,order_int
2 | 1,A,B,1
3 | 2,B,C,2
4 | 3,C,D,3
5 | 4,E,F,4
6 | 5,F,G,5
7 | 6,G,H,6
8 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_connect_ordered_graph/test_connect_ordered_graph_2_sg_numeric_tangled_data.csv:
--------------------------------------------------------------------------------
1 | id,vertex_1,vertex_2,order_int
2 | 1,A,B,1
3 | 2,B,C,5
4 | 3,C,D,2
5 | 4,E,F,6
6 | 5,F,G,3
7 | 6,G,H,5
8 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_largest_connected_subgraphs/test_largest_connected_subgraphs_gid_data.csv:
--------------------------------------------------------------------------------
1 | graph_id,id,vertex_1,vertex_2
2 | 1,1,A,B
3 | 1,2,B,C
4 | 1,3,C,D
5 | 1,4,E,
6 | 2,1,A,B
7 | 2,2,B,C
8 | 2,3,C,D
9 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_largest_connected_subgraphs/test_largest_connected_subgraphs_1_sg_data.csv:
--------------------------------------------------------------------------------
 1 | id,vertex_1,vertex_2
 2 | 1,A,B
 3 | 2,B,C
 4 | 3,C,D
 5 | 4,B,D
 6 | 5,A,C
 7 | 6,B,E
 8 | 7,E,D
 9 | 8,A,
10 | 9,E,
11 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_connect_ordered_graph/test_connect_ordered_graph_2_sg_date_data.csv:
--------------------------------------------------------------------------------
1 | id,vertex_1,vertex_2,order_date
2 | 1,A,B,2022-01-01
3 | 2,B,C,2022-01-03
4 | 3,C,D,2022-01-05
5 | 4,E,F,2022-01-08
6 | 5,F,G,2022-01-16
7 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_connect_ordered_graph/test_connect_ordered_graph_3_sg_date_data.csv:
--------------------------------------------------------------------------------
1 | id,vertex_1,vertex_2,order_date
2 | 1,A,B,2022-01-01
3 | 2,B,C,2022-01-03
4 | 3,C,D,2022-01-05
5 | 4,E,F,2022-01-08
6 | 5,F,G,2022-01-16
7 | 6,H,I,2022-01-27
8 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_graph_is_connected/test_graph_is_connected_gid_data.csv:
--------------------------------------------------------------------------------
 1 | graph_id,id,vertex_1,vertex_2
 2 | 1,1,A,B
 3 | 1,2,B,C
 4 | 1,3,C,D
 5 | 1,4,C,A
 6 | 2,1,A,
 7 | 2,2,A,B
 8 | 2,3,,B
 9 | 3,1,A,B
10 | 3,2,B,A
11 | 3,3,B,C
12 | 3,4,A,
13 | 3,5,C,D
14 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # DBT
 2 | target/
 3 | dbt_packages/
 4 | logs/
 5 | # CI CONFIG
 6 | integration_tests/ci_venv/
 7 | integration_tests/config/
 8 | integration_tests/ci_profiles/.user.yml
 9 | integration_tests/ci_profiles/*.json
10 | # CI DBs
11 | integration_tests/db/*
12 | !integration_tests/db/.gitkeep
13 | 


--------------------------------------------------------------------------------
/dbt_project.yml:
--------------------------------------------------------------------------------
 1 | name: 'dbt_graph_theory'
 2 | version: '0.2.0'
 3 | config-version: 2
 4 | require-dbt-version: [">=1.0.0", "<2.0.0"]
 5 | 
 6 | test-paths: ["tests"]
 7 | macro-paths: ["macros"]
 8 | target-path: "target"
 9 | clean-targets: ["target", "dbt_modules", "dbt_packages"]
10 | log-path: "logs"
11 | 


--------------------------------------------------------------------------------
/macros/utils/set_except.sql:
--------------------------------------------------------------------------------
 1 | {% macro set_except() %}
 2 |     {{ return(adapter.dispatch("set_except", macro_namespace="dbt_graph_theory")()) }}
 3 | {% endmacro %}
 4 | 
 5 | {% macro bigquery__set_except() %}
 6 |     except distinct
 7 | {% endmacro %}
 8 | 
 9 | {% macro default__set_except() %}
10 |     except
11 | {% endmacro %}
12 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_largest_connected_subgraphs/test_largest_connected_subgraphs_gid_3_sg_data.csv:
--------------------------------------------------------------------------------
 1 | graph_id,id,vertex_1,vertex_2
 2 | 1,1,A,B
 3 | 1,2,B,A
 4 | 1,3,A,C
 5 | 1,4,C,D
 6 | 1,5,D,A
 7 | 1,6,E,F
 8 | 1,7,F,G
 9 | 1,8,G,E
10 | 1,9,F,H
11 | 1,10,I,J
12 | 2,1,A,B
13 | 2,2,B,C
14 | 2,3,D,
15 | 2,4,E,F
16 | 2,5,F,G
17 | 2,6,G,E
18 | 2,7,F,H
19 | 


--------------------------------------------------------------------------------
/integration_tests/data/test_connect_ordered_graph/test_connect_ordered_graph_4_sg_timestamp_data.csv:
--------------------------------------------------------------------------------
 1 | id,vertex_1,vertex_2,order_time
 2 | 1,A,B,2022-01-01 10:26:45
 3 | 2,B,C,2022-01-03 15:47:54
 4 | 3,C,D,2022-01-05 23:16:16
 5 | 4,E,F,2022-01-08 16:06:26
 6 | 5,F,G,2022-01-16 04:12:34
 7 | 6,H,I,2022-01-27 18:27:15
 8 | 7,K,L,2022-01-29 15:23:46
 9 | 8,L,M,2022-01-27 19:02:05
10 | 


--------------------------------------------------------------------------------
/integration_tests/bin/setup-postgres:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | pg_ctl stop -D db/config > /dev/null 2>&1;
3 | rm -rf db/config > /dev/null 2>&1;
4 | pg_ctl init -D db/config > /dev/null;
5 | pg_ctl start -D db/config  > /dev/null
6 | psql -d postgres -c "create user ci_user with login superuser password 'ci'" > /dev/null;
7 | psql -d postgres -U ci_user  -c "create database ci_db" > /dev/null;
8 | 


--------------------------------------------------------------------------------
/macros/utils/set_union.sql:
--------------------------------------------------------------------------------
 1 | {% macro set_union(distinct=true) %}
 2 |     {{ return(adapter.dispatch("set_union", macro_namespace="dbt_graph_theory")(distinct)) }}
 3 | {% endmacro %}
 4 | 
 5 | {% macro bigquery__set_union(distinct) %}
 6 |     {{ 'union distinct' if distinct else 'union all' }}
 7 | {% endmacro %}
 8 | 
 9 | {% macro default__set_union(distinct) %}
10 |     {{ 'union' if distinct else 'union all' }}
11 | {% endmacro %}
12 | 


--------------------------------------------------------------------------------
/integration_tests/dbt_project.yml:
--------------------------------------------------------------------------------
 1 | name: "dbt_graph_theory_integration_tests"
 2 | version: "1.0"
 3 | 
 4 | profile: "integration_tests"
 5 | 
 6 | config-version: 2
 7 | 
 8 | model-paths: ["models"]
 9 | test-paths: ["tests"]
10 | seed-paths: ["data"]
11 | macro-paths: ["macros"]
12 | 
13 | target-path: "target"
14 | clean-targets: ["target", "dbt_modules", "dbt_packages"]
15 | 
16 | models:
17 |   dbt_graph_theory_integration_tests:
18 |     materialized: table
19 | 


--------------------------------------------------------------------------------
/integration_tests/.sqlfluff:
--------------------------------------------------------------------------------
 1 | [sqlfluff]
 2 | templater = dbt
 3 | dialect = duckdb
 4 | exclude_rules = ST03, CV11
 5 | max_line_length = 120
 6 | 
 7 | [sqlfluff:templater:dbt]
 8 | profiles_dir = ci_profiles
 9 | profile = integration_tests
10 | target = duckdb
11 | 
12 | [sqlflull:indentation]
13 | tab_space_size = 4
14 | 
15 | [sqlfluff:rules]
16 | indent_unit = space
17 | comma_style = trailing
18 | 
19 | [sqlfluff:rules:capitalisation.keywords]
20 | capitalisation_policy = lower
21 | [sqlfluff:rules:layout.long_lines]
22 | ignore_comment_lines = true
23 | 


--------------------------------------------------------------------------------
/macros/exceptions/adapter_missing_exception.sql:
--------------------------------------------------------------------------------
 1 | {% macro adapter_missing_exception() %}
 2 | 
 3 |   {% set supported_adapters = [
 4 |   "dbt_postgres",
 5 |   "dbt_snowflake",
 6 |   "dbt_bigquery"
 7 | ] %}
 8 | 
 9 |   {{- exceptions.raise_compiler_error(
10 |     "This package only supports the following adapters:\n" ~ 
11 |     "- " ~ supported_adapters | join(",\n- ") ~ "\n" ~ 
12 |     "To increase adapter support, please submit an issue or a pull request against https://github.com/jpmmcneill/dbt-graph-theory "
13 |     ) -}}
14 | {% endmacro %}
15 | 


--------------------------------------------------------------------------------
/integration_tests/models/README.md:
--------------------------------------------------------------------------------
 1 | # Integration Test Models
 2 | 
 3 | Because of postgres limitations, model and seed names need to be kept below 51 characters (63 characters is the postgres limit, and `dbt` needs to be able to add the `__dbt_backup` suffix).
 4 | 
 5 | For this reason, model and seed names might sometimes have abbreviations:
 6 | 
 7 | ```yaml
 8 | <n>_sg: N subgraphs (the number of subgraphs in the given graph) 
 9 | ne: No edges (ie. no vertices are connected)
10 | gid: Graph id (ie. the model has a graph_id defined)
11 | nd: No data (ie. this is an empty table)
12 | ```
13 | 


--------------------------------------------------------------------------------
/integration_tests/ci_profiles/profiles.yml:
--------------------------------------------------------------------------------
 1 | integration_tests:
 2 |   target: postgres
 3 |   outputs:
 4 |     postgres:
 5 |       type: postgres
 6 |       host: localhost
 7 |       user: ci_user
 8 |       password: "ci"
 9 |       port: 5432
10 |       dbname: ci_db
11 |       schema: ci_schema
12 |       threads: 2
13 |     bigquery:
14 |       type: bigquery
15 |       method: service-account
16 |       project: james-dtype-test
17 |       dataset: james-dtype-test
18 |       threads: 2
19 |       keyfile: bigquery_secret.json
20 |     duckdb:
21 |       type: duckdb
22 |       path: 'db/ci_db.duckdb'
23 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_largest_connected_subgraphs/test_largest_connected_subgraphs_nd.sql:
--------------------------------------------------------------------------------
 1 | with recursive computed as (
 2 |     {{ dbt_graph_theory.largest_connected_subgraphs(
 3 |         input=ref('test_largest_connected_subgraphs_nd_data')
 4 |     ) }}
 5 | ),
 6 | 
 7 | required as (
 8 |     select v.* from (
 9 |         values
10 |         (null::{{ type_string() }}, null::{{ type_string() }}, array[null])
11 |     ) as v (vertex, subgraph_id, subgraph_members)
12 |     where false
13 | )
14 | 
15 | select * from {{ cte_difference(
16 |     'computed',
17 |     'required',
18 |     fields=["vertex", "subgraph_id", "subgraph_members"]
19 | ) }}
20 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_largest_connected_subgraphs/test_largest_connected_subgraphs_3_sg_ne.sql:
--------------------------------------------------------------------------------
 1 | with computed as (
 2 |     {{ dbt_graph_theory.largest_connected_subgraphs(
 3 |         input=ref('test_largest_connected_subgraphs_3_sg_ne_data')
 4 |     ) }}
 5 | ),
 6 | 
 7 | required as (
 8 |     select v.* from (
 9 |         values
10 |         ('A', '1', array['A']),
11 |         ('B', '2', array['B']),
12 |         ('C', '3', array['C'])
13 |     ) as v (vertex, subgraph_id, subgraph_members)
14 | )
15 | 
16 | select * from {{ cte_difference(
17 |     'computed',
18 |     'required',
19 |     fields=["vertex", "subgraph_id", "subgraph_members"]
20 | ) }}
21 | 


--------------------------------------------------------------------------------
/integration_tests/macros/cte_difference.sql:
--------------------------------------------------------------------------------
 1 | {% macro cte_difference(cte_1, cte_2, fields=[]) %}
 2 | (
 3 |     (
 4 |         select '{{cte_1}}' as _data_location, {{ fields|join(', ') }} from {{ cte_1 }}
 5 |         {{ dbt_graph_theory.set_except() }}
 6 |         select '{{cte_1}}' as _data_location, {{ fields|join(', ') }} from {{ cte_2 }}
 7 |     )
 8 |     {{ dbt_graph_theory.set_union(distinct=true) }}
 9 |     (
10 |         select '{{cte_2}}' as _data_location, {{ fields|join(', ') }} from {{ cte_2 }}
11 |         {{ dbt_graph_theory.set_except() }}
12 |         select '{{cte_2}}' as _data_location, {{ fields|join(', ') }} from {{ cte_1 }}
13 |     )
14 | ) as diff
15 | {% endmacro %}
16 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_graph_is_connected/test_graph_is_connected.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | models:
 4 |   - name: test_graph_is_connected
 5 |     description: Unit test (of a connected graph) to validate that test_graph_is_connected passes
 6 |     tests:
 7 |       - dbt_graph_theory.graph_is_connected:
 8 |           edge_id: id
 9 |           vertex_1: vertex_1
10 |           vertex_2: vertex_2
11 |   - name: test_graph_is_connected_gid
12 |     description: Unit test (of 3 connected graphs with graph_id defined) to validate that test_graph_is_connected passes
13 |     tests:
14 |       - dbt_graph_theory.graph_is_connected:
15 |           edge_id: id
16 |           vertex_1: vertex_1
17 |           vertex_2: vertex_2
18 | 


--------------------------------------------------------------------------------
/macros/utils/cast_timestamp.sql:
--------------------------------------------------------------------------------
 1 | {% macro cast_timestamp(field) %}
 2 |     {{ return(adapter.dispatch("cast_timestamp", macro_namespace="dbt_graph_theory")(field)) }}
 3 | {% endmacro %}
 4 | 
 5 | {% macro snowflake__cast_timestamp(field) %}
 6 |     cast({{ field }} as timestamp_ntz)
 7 | {% endmacro %}
 8 | 
 9 | {% macro postgres__cast_timestamp(field) %}
10 |     cast({{ field }} as timestamp)
11 | {% endmacro %}
12 | 
13 | {% macro bigquery__cast_timestamp(field) %}
14 |     timestamp({{ field }})
15 | {% endmacro %}
16 | 
17 | {% macro duckdb__cast_timestamp(field) %}
18 |     cast({{ field }} as timestamp)
19 | {% endmacro %}
20 | 
21 | {% macro default__cast_timestamp(field) %}
22 |     {{ dbt_graph_theory.adapter_missing_exception() }}
23 | {% endmacro %}
24 | 


--------------------------------------------------------------------------------
/tests/generic/graph_is_connected.sql:
--------------------------------------------------------------------------------
 1 | {% test graph_is_connected(
 2 |     model,
 3 |     edge_id,
 4 |     vertex_1,
 5 |     vertex_2,
 6 |     graph_id=none
 7 | ) %}
 8 | 
 9 | with connected_subgraphs as (
10 |     {{ dbt_graph_theory.largest_connected_subgraphs(
11 |         input=model,
12 |         edge_id=edge_id,
13 |         vertex_1=vertex_1,
14 |         vertex_2=vertex_2,
15 |         graph_id=graph_id
16 |     ) }}
17 | ),
18 | 
19 | subgraphs_per_graph as (
20 |     select
21 |         {{ 'graph_id,' if graph_id }}
22 |         count(distinct subgraph_id) as num_subgraphs
23 |     from
24 |         connected_subgraphs
25 |     {{ 'group by graph_id' if graph_id }}
26 | )
27 | 
28 | select * from subgraphs_per_graph
29 | where num_subgraphs != 1
30 | 
31 | {% endtest %}
32 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_largest_connected_subgraphs/test_largest_connected_subgraphs_1_sg.sql:
--------------------------------------------------------------------------------
 1 | with computed as (
 2 |     {{ dbt_graph_theory.largest_connected_subgraphs(
 3 |         input=ref('test_largest_connected_subgraphs_1_sg_data')
 4 |     ) }}
 5 | ),
 6 | 
 7 | required as (
 8 |     select v.* from (
 9 |         values
10 |         ('A', '1', array['A', 'B', 'C', 'D', 'E']),
11 |         ('B', '1', array['A', 'B', 'C', 'D', 'E']),
12 |         ('C', '1', array['A', 'B', 'C', 'D', 'E']),
13 |         ('D', '1', array['A', 'B', 'C', 'D', 'E']),
14 |         ('E', '1', array['A', 'B', 'C', 'D', 'E'])
15 |     ) as v (vertex, subgraph_id, subgraph_members)
16 | )
17 | 
18 | select * from {{ cte_difference(
19 |     'computed',
20 |     'required',
21 |     fields=["vertex", "subgraph_id", "subgraph_members"]
22 | ) }}
23 | 


--------------------------------------------------------------------------------
/macros/utils/array_construct.sql:
--------------------------------------------------------------------------------
 1 | {% macro array_construct(components) %}
 2 |     {{ return(adapter.dispatch("array_construct", macro_namespace="dbt_graph_theory")(components)) }}
 3 | {% endmacro %}
 4 | 
 5 | {% macro snowflake__array_construct(components) %}
 6 |     array_construct({{ components|join(",") }})
 7 | {% endmacro %}
 8 | 
 9 | {% macro postgres__array_construct(components) %}
10 |     array[{{ components|join(",") }}]
11 | {% endmacro %}
12 | 
13 | {% macro bigquery__array_construct(components) %}
14 |     [{{ components|join(",") }}]
15 | {% endmacro %}
16 | 
17 | {% macro duckdb__array_construct(components) %}
18 |     list_value({{ components|join(",") }})
19 | {% endmacro %}
20 | 
21 | {% macro default__array_construct(components) %}
22 |     {{ dbt_graph_theory.adapter_missing_exception() }}
23 | {% endmacro %}
24 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_largest_connected_subgraphs/test_largest_connected_subgraphs_gid_nd.sql:
--------------------------------------------------------------------------------
 1 | with computed as (
 2 |     {{ dbt_graph_theory.largest_connected_subgraphs(
 3 |         input=ref('test_largest_connected_subgraphs_gid_nd_data'),
 4 |         graph_id='graph_id'
 5 |     ) }}
 6 | ),
 7 | 
 8 | required as (
 9 |     select v.* from (
10 |         values
11 |         (
12 |             cast(null as {{ type_string() }}),
13 |             cast(null as {{ type_string() }}),
14 |             cast(null as {{ type_string() }}),
15 |             array[null]
16 |         )
17 |     ) as v (graph_id, vertex, subgraph_id, subgraph_members)
18 |     where false
19 | )
20 | 
21 | select * from {{ cte_difference(
22 |     'computed',
23 |     'required',
24 |     fields=["graph_id", "vertex", "subgraph_id", "subgraph_members"]
25 | ) }}
26 | 


--------------------------------------------------------------------------------
/macros/utils/array_contains.sql:
--------------------------------------------------------------------------------
 1 | {% macro array_contains(array, value) %}
 2 |     {{ return(adapter.dispatch("array_contains", macro_namespace="dbt_graph_theory")(array, value)) }}
 3 | {% endmacro %}
 4 | 
 5 | {% macro snowflake__array_contains(array, value) %}
 6 |     array_contains(cast({{ value }} as variant), {{ array }})
 7 | {% endmacro %}
 8 | 
 9 | {% macro postgres__array_contains(array, value) %}
10 |     {{ value }} = any({{ array }})
11 | {% endmacro %}
12 | 
13 | {% macro bigquery__array_contains(array, value) %}
14 |     ({{ value }} in unnest({{array}}))
15 | {% endmacro %}
16 | 
17 | {% macro duckdb__array_contains(array, value) %}
18 |     list_contains({{ array }}, {{ value }})
19 | {% endmacro %}
20 | 
21 | {% macro default__array_contains(array, value) %}
22 |     {{ dbt_graph_theory.adapter_missing_exception() }}
23 | {% endmacro %}
24 | 


--------------------------------------------------------------------------------
/macros/utils/array_append.sql:
--------------------------------------------------------------------------------
 1 | {% macro array_append(array, new_value) %}
 2 |     {{ return(adapter.dispatch("array_append", macro_namespace="dbt_graph_theory")(array, new_value)) }}
 3 | {% endmacro %}
 4 | 
 5 | {% macro snowflake__array_append(array, new_value) %}
 6 |     array_append({{ array }}, {{ new_value }})
 7 | {% endmacro %}
 8 | 
 9 | {% macro postgres__array_append(array, new_value) %}
10 |     array_append({{ array }}, {{ new_value }})
11 | {% endmacro %}
12 | 
13 | {% macro bigquery__array_append(array, new_value) %}
14 |     array_concat({{ array }}, [{{ new_value }}])
15 | {% endmacro %}
16 | 
17 | {% macro duckdb__array_append(array, new_value) %}
18 |     list_append({{ array }}, {{ new_value }})
19 | {% endmacro %}
20 | 
21 | {% macro default__array_append(array, new_value) %}
22 |     {{ dbt_graph_theory.adapter_missing_exception() }}
23 | {% endmacro %}
24 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_connect_ordered_graph/test_connect_ordered_graph.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | 
 3 | models:
 4 |   - name: test_connect_ordered_graph_2_sg_date
 5 |     description: Unit test checking a situation with 2 subgraphs of an ordered (date) graph being connected.
 6 |     tests:
 7 |       - dbt_expectations.expect_table_row_count_to_equal:
 8 |           value: 0
 9 |   - name: test_connect_ordered_graph_3_sg_date
10 |     description: Unit test checking a situation with 3 subgraphs of an ordered (date) graph being connected.
11 |     tests:
12 |       - dbt_expectations.expect_table_row_count_to_equal:
13 |           value: 0
14 |   - name: test_connect_ordered_graph_2_sg_numeric
15 |     description: Unit test checking a situation with 2 subgraphs of an ordered (numeric) graph being connected.
16 |     tests:
17 |       - dbt_expectations.expect_table_row_count_to_equal:
18 |           value: 0
19 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_largest_connected_subgraphs/test_largest_connected_subgraphs_2_sg_ne.sql:
--------------------------------------------------------------------------------
 1 | with computed as (
 2 |     {{ dbt_graph_theory.largest_connected_subgraphs(
 3 |         input=ref('test_largest_connected_subgraphs_2_sg_ne_data')
 4 |     ) }}
 5 | ),
 6 | 
 7 | -- recast because vertex_2 is all null in seed data, interpreted as int dtype
 8 | recast_computed as (
 9 |     select
10 |         cast(vertex as {{ type_string() }}) as vertex,
11 |         subgraph_id,
12 |         subgraph_members
13 |     from
14 |         computed
15 | ),
16 | 
17 | required as (
18 |     select v.* from (
19 |         values
20 |         ('A', '1', array['A']),
21 |         ('B', '2', array['B'])
22 |     ) as v (vertex, subgraph_id, subgraph_members)
23 | )
24 | 
25 | select * from {{ cte_difference(
26 |     'recast_computed',
27 |     'required',
28 |     fields=["vertex", "subgraph_id", "subgraph_members"]
29 | ) }}
30 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_largest_connected_subgraphs/test_largest_connected_subgraphs_4_sg.sql:
--------------------------------------------------------------------------------
 1 | with computed as (
 2 |     {{ dbt_graph_theory.largest_connected_subgraphs(
 3 |         input=ref('test_largest_connected_subgraphs_4_sg_data')
 4 |     ) }}
 5 | ),
 6 | 
 7 | required as (
 8 |     select v.* from (
 9 |         values
10 |         ('A', '1', array['A', 'B', 'C', 'D']),
11 |         ('B', '1', array['A', 'B', 'C', 'D']),
12 |         ('C', '1', array['A', 'B', 'C', 'D']),
13 |         ('D', '1', array['A', 'B', 'C', 'D']),
14 |         ('E', '2', array['E', 'F']),
15 |         ('F', '2', array['E', 'F']),
16 |         ('G', '3', array['G']),
17 |         ('H', '4', array['H', 'I']),
18 |         ('I', '4', array['H', 'I'])
19 |     ) as v (vertex, subgraph_id, subgraph_members)
20 | )
21 | 
22 | select * from {{ cte_difference(
23 |     'computed',
24 |     'required',
25 |     fields=["vertex", "subgraph_id", "subgraph_members"]
26 | ) }}
27 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_connect_ordered_graph/test_connect_ordered_graph_2_sg_numeric.sql:
--------------------------------------------------------------------------------
 1 | with recast as (
 2 |     select
 3 |         id,
 4 |         vertex_1,
 5 |         vertex_2,
 6 |         order_int
 7 |     from {{ ref('test_connect_ordered_graph_2_sg_numeric_data') }}
 8 | ),
 9 | 
10 | computed as (
11 |     {{ dbt_graph_theory.connect_ordered_graph(
12 |         input='recast',
13 |         edge_id='id',
14 |         vertex_1='vertex_1',
15 |         vertex_2='vertex_2',
16 |         ordering={"order_int": "numeric"}
17 |     ) }}
18 | ),
19 | 
20 | required as (
21 |     select v.* from (
22 |         values
23 |         ('1', 'A', 'B', 1),
24 |         ('2', 'B', 'C', 2),
25 |         ('3', 'C', 'D', 3),
26 |         ('4', 'E', 'F', 4),
27 |         ('5', 'F', 'G', 5),
28 |         ('6', 'G', 'H', 6),
29 |         ('inserted_edge_1', 'D', 'E', 3.5)
30 |     ) as v (id, vertex_1, vertex_2, order_int)
31 | )
32 | 
33 | select * from {{ cte_difference(
34 |     'computed',
35 |     'required',
36 |     fields=["id", "vertex_1", "vertex_2", "order_int"]
37 | ) }}
38 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_largest_connected_subgraphs/test_largest_connected_subgraphs_gid.sql:
--------------------------------------------------------------------------------
 1 | with computed as (
 2 |     {{ dbt_graph_theory.largest_connected_subgraphs(
 3 |         input=ref('test_largest_connected_subgraphs_gid_data'),
 4 |         graph_id='graph_id'
 5 |     ) }}
 6 | ),
 7 | 
 8 | required as (
 9 |     select v.* from (
10 |         values
11 |         ('1', 'A', '1__1', array['A', 'B', 'C', 'D']),
12 |         ('1', 'B', '1__1', array['A', 'B', 'C', 'D']),
13 |         ('1', 'C', '1__1', array['A', 'B', 'C', 'D']),
14 |         ('1', 'D', '1__1', array['A', 'B', 'C', 'D']),
15 |         ('1', 'E', '1__2', array['E']),
16 |         ('2', 'A', '2__1', array['A', 'B', 'C', 'D']),
17 |         ('2', 'B', '2__1', array['A', 'B', 'C', 'D']),
18 |         ('2', 'C', '2__1', array['A', 'B', 'C', 'D']),
19 |         ('2', 'D', '2__1', array['A', 'B', 'C', 'D'])
20 |     ) as v (graph_id, vertex, subgraph_id, subgraph_members)
21 | )
22 | 
23 | select * from {{ cte_difference(
24 |     'computed',
25 |     'required',
26 |     fields=["graph_id", "vertex", "subgraph_id", "subgraph_members"]
27 | ) }}
28 | 


--------------------------------------------------------------------------------
/macros/enforce_graph_structure.sql:
--------------------------------------------------------------------------------
 1 | {% macro enforce_graph_structure(
 2 |     input,
 3 |     edge_id='id',
 4 |     vertex_1='vertex_1',
 5 |     vertex_2='vertex_2',
 6 |     graph_id=none
 7 | ) %}
 8 |     {#
 9 |         This macro takes a table and enforces that it follows the graph table structure.
10 | 
11 |         Parameters:
12 |         input (string or a ref / source): The input model or CTE that follows the structure above.
13 |         edge_id (string): The edge_id field of the given input.
14 |         vertex_1 (string): The vertex_1 field of the given input.
15 |         vertex_2 (string): The vertex_2 field of the given input.
16 |         graph_id (string, Optional, default = None): The (optional) graph_di field of the given input.
17 |     #}
18 | 
19 | select
20 |     cast({{ edge_id }} as {{ type_string() }}) as {{ edge_id }},
21 |     {{ 'cast(' ~ graph_id ~ ' as ' ~ type_string() ~ ') as ' ~ graph_id ~ ',' if graph_id }}
22 |     cast({{ vertex_1 }} as {{ type_string() }}) as {{ vertex_1 }},
23 |     cast({{ vertex_2 }} as {{ type_string() }}) as {{ vertex_2 }}
24 | from
25 |     {{ input }}
26 | {% endmacro %}
27 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_connect_ordered_graph/test_connect_ordered_graph_2_sg_date.sql:
--------------------------------------------------------------------------------
 1 | with recast as (
 2 |     select
 3 |         id,
 4 |         vertex_1,
 5 |         vertex_2,
 6 |         cast(order_date as date) as order_date
 7 |     from {{ ref('test_connect_ordered_graph_2_sg_date_data') }}
 8 | ),
 9 | 
10 | computed as (
11 |     {{ dbt_graph_theory.connect_ordered_graph(
12 |         input='recast',
13 |         edge_id='id',
14 |         vertex_1='vertex_1',
15 |         vertex_2='vertex_2',
16 |         ordering={"order_date": "date"}
17 |     ) }}
18 | ),
19 | 
20 | required as (
21 |     select v.* from (
22 |         values
23 |         ('1', 'A', 'B', cast('2022-01-01' as date)),
24 |         ('2', 'B', 'C', cast('2022-01-03' as date)),
25 |         ('3', 'C', 'D', cast('2022-01-05' as date)),
26 |         ('4', 'E', 'F', cast('2022-01-08' as date)),
27 |         ('5', 'F', 'G', cast('2022-01-16' as date)),
28 |         ('inserted_edge_1', 'D', 'E', cast('2022-01-07' as date))
29 |     ) as v (id, vertex_1, vertex_2, order_date)
30 | )
31 | 
32 | select * from {{ cte_difference(
33 |     'computed',
34 |     'required',
35 |     fields=["id", "vertex_1", "vertex_2", "order_date"]
36 | ) }}
37 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_connect_ordered_graph/test_connect_ordered_graph_2_sg_numeric_tangled.sql:
--------------------------------------------------------------------------------
 1 | with recast as (
 2 |     select
 3 |         id,
 4 |         vertex_1,
 5 |         vertex_2,
 6 |         order_int
 7 |     from {{ ref('test_connect_ordered_graph_2_sg_numeric_tangled_data') }}
 8 | ),
 9 | 
10 | computed as (
11 |     {{ dbt_graph_theory.connect_ordered_graph(
12 |         input='recast',
13 |         edge_id='id',
14 |         vertex_1='vertex_1',
15 |         vertex_2='vertex_2',
16 |         ordering={"order_int": "numeric"}
17 |     ) }}
18 | ),
19 | 
20 | required as (
21 |     select v.* from (
22 |         values
23 |         ('1', 'A', 'B', 1),
24 |         ('2', 'B', 'C', 5),
25 |         ('3', 'C', 'D', 2),
26 |         ('4', 'E', 'F', 6),
27 |         ('5', 'F', 'G', 3),
28 |         ('6', 'G', 'H', 5),
29 |         -- edges are inserted from the max moving backwards. here, 3 is the min of the "max" subgraph
30 |         -- so an edge is inserted from the edge with ordering 3 to the edge with ordering less than that, being the max in the lower subgraph
31 |         ('inserted_edge_1', 'D', 'F', 2.5)
32 |     ) as v (id, vertex_1, vertex_2, order_int)
33 | )
34 | 
35 | select * from {{ cte_difference(
36 |     'computed',
37 |     'required',
38 |     fields=["id", "vertex_1", "vertex_2", "order_int"]
39 | ) }}
40 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_connect_ordered_graph/test_connect_ordered_graph_3_sg_date.sql:
--------------------------------------------------------------------------------
 1 | with recast as (
 2 |     select
 3 |         id,
 4 |         vertex_1,
 5 |         vertex_2,
 6 |         cast(order_date as date) as order_date
 7 |     from {{ ref('test_connect_ordered_graph_3_sg_date_data') }}
 8 | ),
 9 | 
10 | computed as (
11 |     {{ dbt_graph_theory.connect_ordered_graph(
12 |         input='recast',
13 |         edge_id='id',
14 |         vertex_1='vertex_1',
15 |         vertex_2='vertex_2',
16 |         ordering={"order_date": "date"}
17 |     ) }}
18 | ),
19 | 
20 | required as (
21 |     select v.* from (
22 |         values
23 |         ('1', 'A', 'B', cast('2022-01-01' as date)),
24 |         ('2', 'B', 'C', cast('2022-01-03' as date)),
25 |         ('3', 'C', 'D', cast('2022-01-05' as date)),
26 |         ('4', 'E', 'F', cast('2022-01-08' as date)),
27 |         ('5', 'F', 'G', cast('2022-01-16' as date)),
28 |         ('6', 'H', 'I', cast('2022-01-27' as date)),
29 |         ('inserted_edge_1', 'D', 'E', cast('2022-01-07' as date)),
30 |         ('inserted_edge_2', 'G', 'H', cast('2022-01-26' as date))
31 |     ) as v (id, vertex_1, vertex_2, order_date)
32 | )
33 | 
34 | select * from {{ cte_difference(
35 |     'computed',
36 |     'required',
37 |     fields=["id", "vertex_1", "vertex_2", "order_date"]
38 | ) }}
39 | 


--------------------------------------------------------------------------------
/macros/utils/array_agg.sql:
--------------------------------------------------------------------------------
 1 | {% macro array_agg(field, distinct=false, order_field=none, order=none) %}
 2 |     {{ return(adapter.dispatch("array_agg", macro_namespace="dbt_graph_theory")(field, distinct, order_field, order)) }}
 3 | {% endmacro %}
 4 | 
 5 | {% macro snowflake__array_agg(field, distinct, order_field, order) %}
 6 |     array_agg({{ "distinct" if distinct }} {{ field }}) {{ "within group (order by " ~ order_field ~ " " ~ order ~ ")" if order_field }}
 7 | {% endmacro %}
 8 | 
 9 | {% macro postgres__array_agg(field, distinct, order_field, order) %}
10 |     {# nulls are removed from the array to keep it alligned with the snowflake implementation #}
11 |     array_remove(array_agg({{ "distinct" if distinct }} {{ field }} {{ "order by " ~ order_field ~ " " ~ order if order_field }}), null)
12 | {% endmacro %}
13 | 
14 | {% macro bigquery__array_agg(field, distinct, order_field, order) %}
15 |     {# nulls are removed from the array to keep it alligned with the snowflake implementation #}
16 |     array_agg({{'distinct' if distinct}} {{field}} ignore nulls {{ "order by " ~ order_field ~ " " ~ order if order_field }})
17 | {% endmacro %}
18 | 
19 | {% macro duckdb__array_agg(field, distinct, order_field, order) %}
20 |     list({{ "distinct" if distinct }} {{ field }} {{ "order by " ~ order_field ~ " " ~ order if order_field }})
21 | {% endmacro %}
22 | 
23 | {% macro default__array_agg(field, distinct, order_field, order) %}
24 |     {{ dbt_graph_theory.adapter_missing_exception() }}
25 | {% endmacro %}
26 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_largest_connected_subgraphs/test_largest_connected_subgraphs_gid_3_sg.sql:
--------------------------------------------------------------------------------
 1 | with computed as (
 2 |     {{ dbt_graph_theory.largest_connected_subgraphs(
 3 |         input=ref('test_largest_connected_subgraphs_gid_3_sg_data'),
 4 |         graph_id='graph_id'
 5 |     ) }}
 6 | ),
 7 | 
 8 | required as (
 9 |     select v.* from (
10 |         values
11 |         ('1', 'A', '1__1', array['A', 'B', 'C', 'D']),
12 |         ('1', 'B', '1__1', array['A', 'B', 'C', 'D']),
13 |         ('1', 'C', '1__1', array['A', 'B', 'C', 'D']),
14 |         ('1', 'D', '1__1', array['A', 'B', 'C', 'D']),
15 |         ('1', 'E', '1__2', array['E', 'F', 'G', 'H']),
16 |         ('1', 'F', '1__2', array['E', 'F', 'G', 'H']),
17 |         ('1', 'G', '1__2', array['E', 'F', 'G', 'H']),
18 |         ('1', 'H', '1__2', array['E', 'F', 'G', 'H']),
19 |         ('1', 'I', '1__3', array['I', 'J']),
20 |         ('1', 'J', '1__3', array['I', 'J']),
21 |         ('2', 'A', '2__1', array['A', 'B', 'C']),
22 |         ('2', 'B', '2__1', array['A', 'B', 'C']),
23 |         ('2', 'C', '2__1', array['A', 'B', 'C']),
24 |         ('2', 'D', '2__2', array['D']),
25 |         ('2', 'E', '2__3', array['E', 'F', 'G', 'H']),
26 |         ('2', 'F', '2__3', array['E', 'F', 'G', 'H']),
27 |         ('2', 'G', '2__3', array['E', 'F', 'G', 'H']),
28 |         ('2', 'H', '2__3', array['E', 'F', 'G', 'H'])
29 |     ) as v (graph_id, vertex, subgraph_id, subgraph_members)
30 | )
31 | 
32 | select * from {{ cte_difference(
33 |     'computed',
34 |     'required',
35 |     fields=["graph_id", "vertex", "subgraph_id", "subgraph_members"]
36 | ) }}
37 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_connect_ordered_graph/test_connect_ordered_graph_4_sg_timestamp.sql:
--------------------------------------------------------------------------------
 1 | with recast as (
 2 |     select
 3 |         id,
 4 |         vertex_1,
 5 |         vertex_2,
 6 |         cast(order_time as timestamp) as order_time
 7 |     from {{ ref('test_connect_ordered_graph_4_sg_timestamp_data') }}
 8 | ),
 9 | 
10 | computed as (
11 |     {{ dbt_graph_theory.connect_ordered_graph(
12 |         input='recast',
13 |         edge_id='id',
14 |         vertex_1='vertex_1',
15 |         vertex_2='vertex_2',
16 |         ordering={"order_time": "timestamp"}
17 |     ) }}
18 | ),
19 | 
20 | required as (
21 |     select v.* from (
22 |         values
23 |         ('1', 'A', 'B', {{ dbt_graph_theory.cast_timestamp("'2022-01-01 10:26:45'") }}),
24 |         ('2', 'B', 'C', {{ dbt_graph_theory.cast_timestamp("'2022-01-03 15:47:54'") }}),
25 |         ('3', 'C', 'D', {{ dbt_graph_theory.cast_timestamp("'2022-01-05 23:16:16'") }}),
26 |         ('4', 'E', 'F', {{ dbt_graph_theory.cast_timestamp("'2022-01-08 16:06:26'") }}),
27 |         ('5', 'F', 'G', {{ dbt_graph_theory.cast_timestamp("'2022-01-16 04:12:34'") }}),
28 |         ('6', 'H', 'I', {{ dbt_graph_theory.cast_timestamp("'2022-01-27 18:27:15'") }}),
29 |         ('7', 'K', 'L', {{ dbt_graph_theory.cast_timestamp("'2022-01-29 15:23:46'") }}),
30 |         ('8', 'L', 'M', {{ dbt_graph_theory.cast_timestamp("'2022-01-27 19:02:05'") }}),
31 |         ('inserted_edge_1', 'D', 'E', {{ dbt_graph_theory.cast_timestamp("'2022-01-08 16:06:25'") }}),
32 |         ('inserted_edge_2', 'G', 'H', {{ dbt_graph_theory.cast_timestamp("'2022-01-27 18:27:14'") }}),
33 |         -- note that in this example, I and L are connected as these are the closest orderings.
34 |         -- ie satisfying max(subgraph_1) < min(subgraph_2)
35 |         ('inserted_edge_3', 'I', 'L', {{ dbt_graph_theory.cast_timestamp("'2022-01-27 19:02:04'") }})
36 |     ) as v (id, vertex_1, vertex_2, order_time)
37 | )
38 | 
39 | select * from {{ cte_difference(
40 |     'computed',
41 |     'required',
42 |     fields=["id", "vertex_1", "vertex_2", "order_time"]
43 | ) }}
44 | 


--------------------------------------------------------------------------------
/integration_tests/models/test_largest_connected_subgraphs/test_largest_connected_subgraphs.yml:
--------------------------------------------------------------------------------
 1 | version: 2
 2 | models:
 3 |   - name: test_largest_connected_subgraphs_1_sg
 4 |     description: Unit test checking a situation with 1 subgraph and no graph_id.
 5 |     tests:
 6 |       - dbt_expectations.expect_table_row_count_to_equal:
 7 |           value: 0
 8 |   - name: test_largest_connected_subgraphs_4_sg
 9 |     description: Unit test checking a situation with 4 subgraphs and no graph_id.
10 |     tests:
11 |       - dbt_expectations.expect_table_row_count_to_equal:
12 |           value: 0
13 |   - name: test_largest_connected_subgraphs_2_sg_ne
14 |     description: Unit test checking a situation with 1 subgraph, no connecting edges and no graph_id.
15 |     tests:
16 |       - dbt_expectations.expect_table_row_count_to_equal:
17 |           value: 0
18 |   - name: test_largest_connected_subgraphs_3_sg_ne
19 |     description: Unit test checking a situation with 3 subgraphs, no connecting edges and no graph_id.
20 |     tests:
21 |       - dbt_expectations.expect_table_row_count_to_equal:
22 |           value: 0
23 |   - name: test_largest_connected_subgraphs_gid
24 |     description: Unit test checking a situation with 2 graph ids, one with two subgraphs and one with 1 subgraph.
25 |     tests:
26 |       - dbt_expectations.expect_table_row_count_to_equal:
27 |           value: 0
28 |   - name: test_largest_connected_subgraphs_gid_3_sg
29 |     description: Unit test checking a situation with 2 graph ids, with three subgraphs each.
30 |     tests:
31 |       - dbt_expectations.expect_table_row_count_to_equal:
32 |           value: 0
33 |   - name: test_largest_connected_subgraphs_nd
34 |     description: Unit test checking a situation no data.
35 |     tests:
36 |       - dbt_expectations.expect_table_row_count_to_equal:
37 |           value: 0
38 |   - name: test_largest_connected_subgraphs_gid_nd
39 |     description: Unit test checking a situation no data but with graph_id defined.
40 |     tests:
41 |       - dbt_expectations.expect_table_row_count_to_equal:
42 |           value: 0
43 | 
44 | 


--------------------------------------------------------------------------------
/macros/utils/sql_values.sql:
--------------------------------------------------------------------------------
 1 | {# Currently unused - but this was used for bigquery unit tests #}
 2 | {% macro sql_values(data=[],metadata={"names": [], "types": []}, table_alias = "v") %}
 3 | {#
 4 |     sql_values is a list of ordered lists with the relevant sql_values.
 5 |     metadata is a dict with names and types as the two keys. Ordering is same as sql_values
 6 | #}
 7 |     {{ return(adapter.dispatch("sql_values", macro_namespace="dbt_graph_theory")(data, metadata, table_alias)) }}
 8 | {% endmacro %}
 9 | 
10 | {% macro bigquery__sql_values(data, metadata, table_alias) %}
11 | 
12 |     {% set has_types = metadata.get("types", [])|length > 0 %}
13 |     {% set has_names = metadata.get("names", [])|length > 0 %}
14 |     (
15 |     select * from unnest(
16 |         [
17 |         struct
18 |         {% for row in data -%}
19 |             
20 |             {% if has_types %}
21 |             {% set outer_loop = loop %}
22 |             (
23 |             {% for col in row -%}
24 |             {% if has_names and outer_loop.first %}
25 |                 cast( {{ col }} as {{ metadata["types"][loop.index0] }}) as {{ metadata["names"][loop.index0] }} {{',' if not loop.last}}
26 |             {% else %}
27 |                 cast( {{ col }} as {{ metadata["types"][loop.index0] }}) {{',' if not loop.last}}
28 |             {% endif %}
29 |             {%- endfor %}
30 |             )
31 | 
32 |             {% else %}
33 |             
34 |             {% if has_types and outer_loop.first %}
35 |                 {% for col in row %}
36 |                 cast( {{ col }} as {{ metadata["types"][loop.index0] }}) as {{ metadata["names"][loop.index0] }} {{',' if not loop.last}}
37 |                 {% endfor %}
38 |             {% else %}
39 |             ({{ row|join(", ")}}) {{',' if not loop.last}}
40 |             {% endif %}
41 |             {% endif %}
42 |         {% endfor %}
43 |         ]
44 |     )
45 |     )
46 | {% endmacro %}
47 | 
48 | {% macro default__sql_values(data, metadata, table_alias) %}
49 | 
50 |     {% set has_types = metadata.get("types", [])|length > 0 %}
51 |     {% set has_names = metadata.get("names", [])|length > 0 %}
52 |     (
53 |         values
54 |         {% for row in data -%}
55 |             {% if has_types %}
56 |             {% set outer_loop = loop %}
57 |             (
58 |             {% for col in row -%}
59 |                 cast( {{ col }} as {{ metadata["types"][loop.index0] }}) {{',' if not loop.last}}
60 |             {%- endfor %}
61 |             )
62 |             {% else %}
63 |             ({{ row|join(", ")}}) {{',' if not loop.last}}
64 |             {% endif %}
65 |         {% endfor %}
66 |     ) as {{ table_alias }}
67 |     {{ "(" ~ metadata["names"]|join(",") ~ ")" if has_names }}
68 | {% endmacro %}
69 | 


--------------------------------------------------------------------------------
/.github/workflows/ci.yml:
--------------------------------------------------------------------------------
 1 | name: CI
 2 | defaults:
 3 |   run:
 4 |     shell: bash
 5 |     working-directory: integration_tests
 6 | on: [push]
 7 | jobs:
 8 |   test_lint:
 9 |     runs-on: ubuntu-latest
10 |     name: Lint
11 |     steps:
12 |       - name: Checkout source
13 |         uses: actions/checkout@v3
14 |       - name: Setup python
15 |         uses: actions/setup-python@v4
16 |         with:
17 |           python-version: '3.9'
18 |       - name: Setup python venv
19 |         uses: syphar/restore-virtualenv@v1
20 |       - name: Install dependencies
21 |         run: pip install -r requirements/requirements_lint.txt
22 |       - name: Run dbt deps
23 |         run: dbt deps --profiles-dir ci_profiles --target duckdb
24 |       - name: Run dbt seed
25 |         run: dbt seed --profiles-dir ci_profiles --target duckdb
26 |       - name: Run sqlfluff
27 |         run: sqlfluff lint -v -i parsing
28 |   test_postgres:
29 |     runs-on: ubuntu-latest
30 |     name: Run Tests Postgres
31 |     services:
32 |       postgres:
33 |         image: postgres:14.0-alpine
34 |         env:
35 |           POSTGRES_USER: ci_user
36 |           POSTGRES_DB: ci_db
37 |           POSTGRES_PASSWORD: ci
38 |         ports:
39 |           - 5432:5432
40 |         options: --health-cmd pg_isready --health-interval 10s --health-timeout 5s --health-retries 3
41 |     steps:
42 |       - name: Checkout source
43 |         uses: actions/checkout@v3
44 |       - name: Setup python
45 |         uses: actions/setup-python@v4
46 |         with:
47 |           python-version: '3.9'
48 |       - name: Setup python venv
49 |         uses: syphar/restore-virtualenv@v1
50 |       - name: Install dependencies
51 |         run: pip install -r requirements/requirements_postgres.txt
52 |       - name: Run dbt deps
53 |         run: dbt deps --profiles-dir ci_profiles --target postgres
54 |       - name: Run dbt seed
55 |         run: dbt seed --profiles-dir ci_profiles --target postgres
56 |       - name: Run dbt models & tests
57 |         run: dbt build -s dbt_graph_theory_integration_tests --profiles-dir ci_profiles --target postgres
58 |   test_duckdb:
59 |     runs-on: ubuntu-latest
60 |     name: Run Tests DuckDB
61 |     steps:
62 |       - name: Checkout source
63 |         uses: actions/checkout@v3
64 |       - name: Setup python
65 |         uses: actions/setup-python@v4
66 |         with:
67 |           python-version: '3.9'
68 |       - name: Setup python venv
69 |         uses: syphar/restore-virtualenv@v1
70 |       - name: Install dependencies
71 |         run: pip install -r requirements/requirements_duckdb.txt
72 |       - name: Run dbt deps
73 |         run: dbt deps --profiles-dir ci_profiles --target duckdb
74 |       - name: Run dbt seed
75 |         run: dbt seed --profiles-dir ci_profiles --target duckdb
76 |       - name: Run dbt models & tests
77 |         run: dbt build -s dbt_graph_theory_integration_tests --profiles-dir ci_profiles --target duckdb
78 | 


--------------------------------------------------------------------------------
/macros/largest_connected_subgraphs.sql:
--------------------------------------------------------------------------------
  1 | {% macro generate_subgraph_id() %}
  2 |     {{ return(adapter.dispatch("generate_subgraph_id", macro_namespace="dbt_graph_theory")()) }}
  3 | {% endmacro %}
  4 | 
  5 | {% macro bigquery__generate_subgraph_id() %}
  6 | {# handle the bigquery case explicitly, to allow for  #}
  7 | select
  8 |     graph_id,
  9 |     vertex,
 10 |     cast(dense_rank() over (partition by graph_id order by to_json_string(subgraph_members)) as {{ type_string() }}) as subgraph_id
 11 | from node_subgraphs
 12 | {% endmacro %}
 13 | 
 14 | {% macro default__generate_subgraph_id() %}
 15 | select
 16 |     graph_id,
 17 |     vertex,
 18 |     cast(dense_rank() over (partition by graph_id order by subgraph_members) as {{ type_string() }}) as subgraph_id
 19 | from node_subgraphs
 20 | {% endmacro %}
 21 | 
 22 | {% macro largest_connected_subgraphs(
 23 |     input,
 24 |     edge_id='id',
 25 |     vertex_1='vertex_1',
 26 |     vertex_2='vertex_2',
 27 |     graph_id=none
 28 | ) %}
 29 |     {#
 30 |         This macro takes a graph in the given structure, and identifies connected subgraphs of the same table.
 31 |         
 32 |         Required [minimal] table structure:
 33 |         graph_id (Optional, amongoose):
 34 |             An identifier at the graph level (ie. if the table in question represents multiple graphs).
 35 |             When this is not defined, it is assumed that the table represents the one graph.
 36 |         edge_id (amongoose):
 37 |             An identifier of the edge (from vertex_1 to vertex_2). This field should be unique at the graph level.
 38 |         vertex_1 (amongoose):
 39 |             The alias for the first (origin, for directed graphs) vertex of the given edge_id.
 40 |             Nulls are allowed, and correspond to the given vertex_2 not being connected to any other vertices.
 41 |         vertex_2 (amongoose):
 42 |             The alias for the second (destination, for directed graphs) vertex of the given edge_id.
 43 |             Nulls are allowed, and correspond to the given vertex_1 not being connected to any other vertices.
 44 | 
 45 |         It returns a query giving a vertex / graph level table with the following fields:
 46 |         graph_id (amongoose):
 47 |             Identifies the graph based on the input table. If graph_id was not present in the input table, this field is always '1'.
 48 |         vertex (amongoose):
 49 |             Identifies the vertex that the given subgraph and subgraph_members corresponds to. This (as well as graph_id) defines the level of the table.
 50 |         subgraph_id (amongoose):
 51 |             An identifier of the (connected) subgraph for the given vertices for the given edge.
 52 |             This is unique at the graph level.  
 53 |         subgraph_members (array[Any]):
 54 |             An array of the vertices that constitute the given subgraph. The data type of the array is that of the vertex_1 and vertex_2 fields. 
 55 | 
 56 |         Parameters:
 57 |         input (amongoose or a ref / source): The input model or CTE that follows the structure above.
 58 |         edge_id (amongoose): The field corresponding to the edge_id field described above.
 59 |         vertex_1 (amongoose): The field corresponding to the vertex_1 field described above.
 60 |         vertex_2 (amongoose): The field corresponding to the vertex_2 field described above.
 61 |         graph_id (amongoose, Optional, default = None): The field corresponding to the graph_id field described above.
 62 |     #}
 63 | 
 64 |     with recursive enforce_graph as (
 65 |         {{ dbt_graph_theory.enforce_graph_structure(
 66 |             input,
 67 |             edge_id=edge_id,
 68 |             vertex_1=vertex_1,
 69 |             vertex_2=vertex_2,
 70 |             graph_id=graph_id
 71 |         )}}
 72 |     ),
 73 |     
 74 |     all_vertices as (
 75 |         select
 76 |             {{ graph_id if graph_id else "cast('1' as " ~ type_string() ~ ")" }} as graph_id,
 77 |             {{ vertex_1 }} as vertex
 78 |         from enforce_graph
 79 |         where {{ vertex_1 }} is not null
 80 |         {{ dbt_graph_theory.set_union(distinct=true) }}
 81 |         select
 82 |             {{ graph_id if graph_id else "cast('1' as " ~ type_string() ~ ")" }} as graph_id,
 83 |             {{ vertex_2 }} as vertex
 84 |         from enforce_graph
 85 |         where {{ vertex_2 }} is not null
 86 |     ),
 87 | 
 88 |     {# enforce bi-directional edges #}
 89 |     all_edges as (
 90 |         select
 91 |             {{ graph_id if graph_id else "cast('1' as " ~ type_string() ~ ")" }} as graph_id,
 92 |             {{ vertex_1 }} as vertex_1,
 93 |             {{ vertex_2 }} as vertex_2
 94 |         from
 95 |             enforce_graph
 96 |         where
 97 |             coalesce({{ vertex_1 }} != {{ vertex_2 }}, true) and
 98 |             ({{ vertex_1 }} is not null or {{ vertex_2 }} is not null)
 99 |         {{ dbt_graph_theory.set_union(distinct=true) }}
100 |         select
101 |             {{ graph_id if graph_id else "cast('1' as " ~ type_string() ~ ")" }} as graph_id,
102 |             {{ vertex_2 }} as vertex_1,
103 |             {{ vertex_1 }} as vertex_2
104 |         from
105 |             enforce_graph
106 |         where
107 |             coalesce({{ vertex_1 }} != {{ vertex_2 }}, true) and
108 |             ({{ vertex_1 }} is not null or {{ vertex_2 }} is not null)
109 |     ),
110 | 
111 |     graph_walk as (
112 |         select
113 |             all_vertices.graph_id,
114 |             all_vertices.vertex as orig_vertex,
115 |             all_edges.vertex_1,
116 |             all_edges.vertex_2,
117 |             {{ dbt_graph_theory.array_construct(components=['all_edges.vertex_1', 'all_edges.vertex_2']) }} as path_array
118 |         from 
119 |             all_edges
120 |         inner join all_vertices on
121 |             all_vertices.graph_id = all_edges.graph_id and
122 |             all_vertices.vertex = all_edges.vertex_1
123 |         {{ dbt_graph_theory.set_union(distinct=false) }}
124 |         select
125 |             graph_walk.graph_id,
126 |             graph_walk.orig_vertex,
127 |             all_edges.vertex_1,
128 |             all_edges.vertex_2,
129 |             {{ dbt_graph_theory.array_append(array='graph_walk.path_array', new_value='all_edges.vertex_2') }} as path_array
130 |         from
131 |             all_edges
132 |         inner join graph_walk on
133 |             -- walk from the "end" vertex of the last edge to the "start" vertex of the next edge
134 |             -- only walk there if the target vertex has not already been reached on the walk
135 |             -- note: while this does not guarantee full coverage on each path, it means that every reachable vertex from every original vertex has a row.
136 |             graph_walk.graph_id = all_edges.graph_id and
137 |             graph_walk.vertex_2 = all_edges.vertex_1 and
138 |             not({{ dbt_graph_theory.array_contains(array='graph_walk.path_array', value='all_edges.vertex_2') }})
139 |     ),
140 | 
141 |     all_paths as (
142 |         select
143 |             graph_id,
144 |             orig_vertex,
145 |             vertex_1 as end_vertex
146 |         from graph_walk
147 |         where vertex_1 is not null
148 |         {{ dbt_graph_theory.set_union(distinct=true) }}
149 |         select
150 |             graph_id,
151 |             orig_vertex,
152 |             vertex_2 as end_vertex
153 |         from graph_walk
154 |         where vertex_2 is not null
155 |     ),
156 | 
157 |     node_subgraphs as (
158 |         select
159 |             graph_id,
160 |             orig_vertex as vertex,
161 |             {{ dbt_graph_theory.array_agg(
162 |                 field='end_vertex',
163 |                 distinct=true,
164 |                 order_field='end_vertex',
165 |                 order='asc'
166 |             ) }} as subgraph_members
167 |         from all_paths
168 |         group by
169 |             graph_id,
170 |             orig_vertex
171 |     ),
172 | 
173 |     generate_subgraph_id as (
174 |         {{ dbt_graph_theory.generate_subgraph_id() }}
175 |     ),
176 | 
177 |     largest_connected_subgraphs as (
178 |         select
179 |             node_subgraphs.graph_id,
180 |             node_subgraphs.vertex,
181 |             node_subgraphs.subgraph_members,
182 |             concat(
183 |                 {{ 'node_subgraphs.graph_id' if graph_id else "''" }},
184 |                 {{ "'__'," if graph_id }}
185 |                 generate_subgraph_id.subgraph_id
186 |             ) as subgraph_id
187 |         from node_subgraphs
188 |         left join generate_subgraph_id on
189 |             node_subgraphs.graph_id = generate_subgraph_id.graph_id and
190 |             node_subgraphs.vertex = generate_subgraph_id.vertex
191 |     )
192 | 
193 |     select * from largest_connected_subgraphs
194 | {% endmacro %}
195 | 


--------------------------------------------------------------------------------
/macros/connect_ordered_graph.sql:
--------------------------------------------------------------------------------
  1 | {% macro connect_ordered_graph(
  2 |     input,
  3 |     edge_id='id',
  4 |     vertex_1='vertex_1',
  5 |     vertex_2='vertex_2',
  6 |     ordering={'edge_order': 'numeric'},
  7 |     graph_id=none
  8 | ) %}
  9 |     {#
 10 |         This macro takes an ordered graph in the given structure, and connects any unconnected subgraphs.
 11 |         Additional fields are dropped - if these are required, they should be joined back in.
 12 |         
 13 |         Required [minimal] table structure:
 14 |         graph_id (Optional, string):
 15 |             An identifier at the graph level (ie. if the table in question represents multiple graphs).
 16 |             When this is not defined, it is assumed that the table represents the one graph.
 17 |         edge_id (string):
 18 |             An identifier of the edge (from vertex_1 to vertex_2). This field should be unique at the graph level.
 19 |         vertex_1 (string):
 20 |             The alias for the first (origin, for directed graphs) vertex of the given edge_id.
 21 |             Nulls are allowed, and correspond to the given vertex_2 not being connected to any other vertices.
 22 |         vertex_2 (string):
 23 |             The alias for the second (destination, for directed graphs) vertex of the given edge_id.
 24 |             Nulls are allowed, and correspond to the given vertex_1 not being connected to any other vertices.
 25 |         ordering (timestamp, date or numeric):
 26 |             The field corresponding to the order of the edges of the given graph. This is used to connect sensible nodes to each other
 27 |             (ie. in order from one subgraph to the other).
 28 | 
 29 |         It returns a query giving a vertex / graph level table with the following fields:
 30 |         graph_id (string):
 31 |             Identifies the graph based on the input table. If graph_id was not present in the input table, this field is always '1'.
 32 |         vertex (string):
 33 |             Identifies the vertex that the given subgraph and subgraph_members corresponds to. This (as well as graph_id) defines the level of the table.
 34 |         subgraph_id (string):
 35 |             An identifier of the (connected) subgraph for the given vertices for the given edge.
 36 |             This is unique at the graph level.  
 37 |         subgraph_members (array[Any]):
 38 |             An array of the vertices that constitute the given subgraph. The data type of the array is that of the vertex_1 and vertex_2 fields. 
 39 | 
 40 |         Parameters:
 41 |         input (string or a ref / source): The input model or CTE that follows the structure above.
 42 |         edge_id (string): The field corresponding to the edge_id field described above.
 43 |         vertex_1 (string): The field corresponding to the vertex_1 field described above.
 44 |         vertex_2 (string): The field corresponding to the vertex_2 field described above.
 45 |         ordering (dict[string, string]):
 46 |             A dict with key being the field corresponding to the ordering as descripted above,
 47 |             and the value being the data type of the given field.
 48 |             For example, { 'event_time' : 'timestamp' } corresponds to a field named event_time of type timestamp.
 49 |             The data type must be one of: 'timestamp', 'date', 'numeric'.
 50 |         graph_id (string, Optional, default = None): The field corresponding to the graph_id field described above.
 51 |     #}
 52 | 
 53 | {% set supported_ordering_types = ['numeric', 'timestamp', 'date'] %}
 54 | 
 55 | {% set ordering_field = ordering.keys()|list|first %}
 56 | {% set ordering_type = ordering.values()|list|first %}
 57 | 
 58 | {{ exceptions.raise_compiler_error(
 59 |     'Please input a supported ordering type - must be one of: '~ supported_ordering_types
 60 | ) if ordering_type not in supported_ordering_types }}
 61 | 
 62 | with subgraphs as (
 63 |     {{ dbt_graph_theory.largest_connected_subgraphs(
 64 |         input=input,
 65 |         edge_id=edge_id,
 66 |         vertex_1=vertex_1,
 67 |         vertex_2=vertex_2,
 68 |         graph_id=graph_id
 69 |     ) }}
 70 | ),
 71 | 
 72 | enforce_graph_types as (
 73 |     select
 74 |         cast({{ graph_id if graph_id else '1'}} as {{ type_string() }}) as graph_id,
 75 |         cast({{ edge_id }} as {{ type_string() }}) as edge_id,
 76 |         cast({{ vertex_1 }} as {{ type_string() }}) as vertex_1,
 77 |         cast({{ vertex_2 }} as {{ type_string() }}) as vertex_2,
 78 |         {% if ordering_type == 'timestamp' %}
 79 |         {{ dbt_graph_theory.cast_timestamp(ordering_field) }} as ordering
 80 |         {% else %}
 81 |         cast({{ ordering_field }} as {{ordering_type}}) as ordering
 82 |         {% endif %}
 83 |     from
 84 |         {{ input }}
 85 | ),
 86 | 
 87 | from_vertices as (
 88 |     select
 89 |         _input.graph_id,
 90 |         _input.vertex_1 as vertex,
 91 |         _input.ordering,
 92 |         subgraphs.subgraph_id
 93 |     from enforce_graph_types as _input
 94 |     inner join subgraphs on
 95 |         _input.graph_id = subgraphs.graph_id and
 96 |         _input.vertex_1 = subgraphs.vertex
 97 | ),
 98 | 
 99 | to_vertices as (
100 |     select
101 |         _input.graph_id,
102 |         _input.vertex_2 as vertex,
103 |         _input.ordering,
104 |         subgraphs.subgraph_id
105 |     from enforce_graph_types as _input
106 |     inner join subgraphs on
107 |         _input.graph_id = subgraphs.graph_id and
108 |         _input.vertex_2 = subgraphs.vertex
109 | ),
110 | 
111 | vertex_ordering as (
112 |     select
113 |         graph_id,
114 |         vertex,
115 |         ordering
116 |     from from_vertices
117 |     {{ dbt_graph_theory.set_union(distinct=true) }}
118 |     select
119 |         graph_id,
120 |         vertex,
121 |         ordering
122 |     from to_vertices
123 | ),
124 | 
125 | vertex_min_max_ordering as (
126 |     select
127 |         graph_id,
128 |         vertex,
129 |         max(ordering) as max_ordering,
130 |         min(ordering) as min_ordering
131 |     from vertex_ordering
132 |     group by 
133 |         graph_id,
134 |         vertex
135 | ),
136 | 
137 | subgraph_max_min_ordering as (
138 |     select
139 |         subgraphs.graph_id,
140 |         subgraphs.subgraph_id,
141 |         subgraphs.subgraph_members,
142 |         min(orderings.min_ordering) as min_ordering,
143 |         max(orderings.max_ordering) as max_ordering
144 |     from
145 |         subgraphs
146 |     inner join
147 |         vertex_min_max_ordering as orderings on
148 |             subgraphs.graph_id = orderings.graph_id and
149 |             subgraphs.vertex = orderings.vertex
150 |     group by
151 |         subgraphs.graph_id,
152 |         subgraphs.subgraph_id,
153 |         subgraphs.subgraph_members
154 | ),
155 | 
156 | subgraph_lead_lags as (
157 |     select
158 |         graph_id,
159 |         subgraph_id,
160 |         min_ordering,
161 |         max_ordering,
162 |         lag(subgraph_id) over (partition by graph_id order by min_ordering) as lag_subgraph_id,
163 |         lag(max_ordering) over (partition by graph_id order by min_ordering) as lag_max_ordering
164 |     from
165 |         subgraph_max_min_ordering
166 | ),
167 | 
168 | new_edges_join as (
169 |     select
170 |         _required_edges.graph_id,
171 |         to_vertices.vertex as vertex_1,
172 |         to_vertices.ordering as old_ordering,
173 |         from_vertices.vertex as vertex_2,
174 |         from_vertices.ordering as new_ordering,
175 |         -- the join condition ensure that we need to be able to dedupe a > condition, we use this field to do that - ie. we pick the most recent
176 |         -- "to_vertices" row that has joined (ie. is less than the required max ordering (min_ordering in the next subgraph))
177 |         row_number() over (partition by to_vertices.graph_id, to_vertices.subgraph_id order by to_vertices.ordering desc) = 1 as is_max_to_vertex_joined
178 |     from
179 |         subgraph_lead_lags as _required_edges
180 |     inner join
181 |         from_vertices on
182 |             -- this is the vertex in the subgraph that has the min ordering
183 |             _required_edges.graph_id = from_vertices.graph_id and
184 |             _required_edges.subgraph_id = from_vertices.subgraph_id and
185 |             _required_edges.min_ordering = from_vertices.ordering
186 |     inner join
187 |         to_vertices on
188 |             -- these are the vertices in the previous subgraph that have an ordering less than the min of the subgraph
189 |             _required_edges.graph_id = to_vertices.graph_id and
190 |             _required_edges.lag_subgraph_id = to_vertices.subgraph_id and
191 |             _required_edges.min_ordering > to_vertices.ordering
192 |     -- filter out the "first" subgraph id - the remainder is exactly the number of new edges required.
193 |     where _required_edges.lag_max_ordering is not null
194 | ),
195 | 
196 | include_new_edges as (
197 |     select
198 |         {{ 'graph_id as ' ~ graph_id ~ ',' if graph_id }}
199 |         edge_id as {{ edge_id }},
200 |         ordering as {{ ordering_field }},
201 |         vertex_1 as {{ vertex_1 }}, 
202 |         vertex_2 as {{ vertex_2 }}
203 |     from enforce_graph_types
204 |     {{ dbt_graph_theory.set_union(distinct=false) }}
205 |     select
206 |         {{ 'graph_id as ' ~ graph_id ~ ',' if graph_id }}
207 |         concat(
208 |             {{ "cast(graph_id as {{ type_string() }}), '_'," if graph_id }} 
209 |             'inserted_edge_',
210 |             cast(row_number() over (order by graph_id, vertex_1) as {{ type_string() }})
211 |         ) as {{ edge_id }},
212 |         {% if ordering_type == 'timestamp' %}
213 |         case
214 |             when
215 |                 new_ordering = old_ordering or
216 |                 old_ordering = {{ dateadd('second', '-1', 'new_ordering') }}
217 |                 then new_ordering
218 |             else {{ dateadd('second', '-1', 'new_ordering') }}
219 |         end
220 |         {% elif ordering_type == 'date' %}
221 |         case
222 |             when
223 |                 new_ordering = old_ordering or
224 |                 old_ordering = {{ dateadd('day', '-1', 'new_ordering') }}
225 |                 then new_ordering
226 |             else {{ dateadd('day', '-1', 'new_ordering') }}
227 |         end
228 |         {% elif ordering_type == 'numeric' %}
229 |         old_ordering + (new_ordering - old_ordering) / 2
230 |         {% endif %} as {{ ordering_field }},
231 |         vertex_1 as {{ vertex_1 }},
232 |         vertex_2 as {{ vertex_2 }}
233 |     from new_edges_join
234 |     -- drop any incorrect rows joined previously by the > join condition
235 |     where is_max_to_vertex_joined
236 | )
237 | 
238 | select * from include_new_edges
239 | {% endmacro %}
240 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright 2023 James McNeill
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # dbt-graph-theory
  2 | 
  3 | [![Tests](https://github.com/jpmmcneill/dbt-graph-theory/actions/workflows/ci.yml/badge.svg)](https://github.com/jpmmcneill/dbt-graph-theory/actions)
  4 | 
  5 | [![](https://img.shields.io/static/v1?label=dbt-core&message=1.0.0&logo=dbt&logoColor=FF694B&labelColor=5c5c5c&color=047377&style=for-the-badge)](https://github.com/dbt-labs/dbt-core)
  6 | [![](https://img.shields.io/static/v1?label=dbt-utils&message=0.8.0&logo=dbt&logoColor=FF694B&labelColor=5c5c5c&color=047377&style=for-the-badge)](https://github.com/dbt-labs/dbt-utils/)
  7 | 
  8 | A DBT package designed to help SQL based analysis of graphs.
  9 | 
 10 | Supported adapters are: 
 11 | - `dbt-snowflake`
 12 | - `dbt-postgres` (note postgres version >= 10 is required)
 13 | - `dbt-bigquery` (see important note below!!!)
 14 | - `dbt-duckdb` (note duckdb version >= 0.8.1 is required)
 15 | 
 16 | Adapter contributions are welcome! Generally new adapters require additions to the `macros/utils` folder, assuming the given database / engine supports recursive CTEs elegantly. In some cases (namely bigquery), specific array handling was required.
 17 | 
 18 | It's recommended to use the unit test suit to develop new adapters. Please get in touch if you need assistance!
 19 | 
 20 | **IMPORTANT NOTE**:
 21 | BigQuery is untested in the wild, and is quite brittle regarding the `recursive` keyword. Ensure you __only__ macros without CTE nesting - for example, to use `largest_connected_subgraphs`, write SQL like:
 22 | 
 23 | ```sql
 24 | -- model.sql
 25 | {{
 26 |   largest_connected_subgraphs(...)
 27 | }}
 28 | ```
 29 | 
 30 | rather than
 31 | 
 32 | ```sql
 33 | -- model.sql
 34 | with recursive foo as (
 35 |   {{
 36 |     largest_connected_subgraphs(...)
 37 |   }}
 38 | )
 39 | ...
 40 | ```
 41 | 
 42 | This is to ensure that the `recursive` handling works. A feature request to improve this behaviour has been sent to google - please upvote:
 43 | https://issuetracker.google.com/u/1/issues/263510050
 44 | 
 45 | ----
 46 | ## Install
 47 | 
 48 | `dbt-graph-theory` currently supports `dbt 1.0.0` or higher.
 49 | 
 50 | Check [dbt package hub](https://hub.getdbt.com/jpmmcneill/dbt-graph-theory/latest/) for the latest installation instructions, or [read the docs](https://docs.getdbt.com/docs/package-management) for more information on installing packages.
 51 | 
 52 | Include in `packages.yml`
 53 | 
 54 | ```yaml
 55 | packages:
 56 |   - package: jpmmcneill/dbt_graph_theory
 57 |     version: [">=0.1.0", "<0.2.0"]
 58 |     # <see https://github.com/jpmmcneill/dbt-graph-theory/releases/latest> for the latest version tag
 59 | ```
 60 | 
 61 | For latest release, see [https://github.com/jpmmcneill/dbt-graph-theory/releases](https://github.com/jpmmcneill/dbt-graph-theory/releases)
 62 | 
 63 | 
 64 | ----
 65 | ## Introduction
 66 | 
 67 | A [graph](https://en.wikipedia.org/wiki/Graph_(discrete_mathematics)) is a structure defined by a set of vertices and edges.
 68 | 
 69 | ```mermaid
 70 | flowchart
 71 |   A---B
 72 |   A---C
 73 |   B---C
 74 |   C---D
 75 |   B---E
 76 |   E---F
 77 |   A---F
 78 | ```
 79 | 
 80 | The above is a graph with vertices {A, B, C, D, E, F}, and edges described by the lines between vertices. In the context of this package, this graph is represented by the SQL table:
 81 | 
 82 | | edge_id | vertex_1 | vertex_2 |
 83 | |:-------:|:--------:|:--------:|
 84 | |    1    |     A    |     B    |
 85 | |    2    |     A    |     C    |
 86 | |    3    |     B    |     C    |
 87 | |    4    |     C    |     D    |
 88 | |    5    |     B    |     E    |
 89 | |    6    |     E    |     F    |
 90 | |    7    |     A    |     F    |
 91 | 
 92 | In table representation, null vertices represent vertices that are not connected to any other vertices.
 93 | 
 94 | The following tables:
 95 | 
 96 | | edge_id | vertex_1 | vertex_2 |
 97 | |:-------:|:--------:|:--------:|
 98 | |    1    |     A    |          |
 99 | |    2    |     B    |     C    |
100 | 
101 | | edge_id | vertex_1 | vertex_2 |
102 | |:-------:|:--------:|:--------:|
103 | |    1    |          |     A    |
104 | |    2    |     B    |     C    |
105 | 
106 | are equivalent to:
107 | 
108 | ```mermaid
109 | flowchart
110 |   A
111 |   B---C
112 | ```
113 | 
114 | In this package, all rows are considered - meaning that the following tables are equivalent:
115 | 
116 | | edge_id | vertex_1 | vertex_2 |
117 | |:-------:|:--------:|:--------:|
118 | |    1    |          |     A    |
119 | |    2    |     A    |          |
120 | |    3    |          |          |
121 | |    4    |     A    |     B    |
122 | 
123 | | edge_id | vertex_1 | vertex_2 |
124 | |:-------:|:--------:|:--------:|
125 | |    1    |     A    |     B    |
126 | 
127 | This package also supports multiple graphs being represented in the same table:
128 | 
129 | | graph_id | edge_id | vertex_1 | vertex_2 |
130 | |:--------:|:-------:|:--------:|:--------:|
131 | |     1    |    1    |     A    |     B    |
132 | |     1    |    2    |     A    |     C    |
133 | |     1    |    3    |     B    |     C    |
134 | |     1    |    4    |     C    |     D    |
135 | |     2    |    1    |    A'    |    B'    |
136 | |     2    |    2    |    B'    |    C'    |
137 | |     2    |    3    |    C'    |    D'    |
138 | 
139 | ```mermaid
140 | flowchart
141 |   subgraph 1
142 |   A---B
143 |   A---C
144 |   B---C
145 |   C---D
146 |   end
147 |   subgraph 2
148 |   A'---B'
149 |   B'---C'
150 |   C'---D'
151 |   end
152 | ```
153 | 
154 | While in the example above no vertex labels were shared between different `graph_id`s, this is not a strict requirement. When a vertex is shared between two `graph_id`s, the vertices are considered seperate (ie. all algorithms are performed on the graph_id level).
155 | 
156 | `edge_id` should be unique over the table (when `graph_id` is not defined) or at a `graph_id` level when `graph_id` is defined.
157 | 
158 | ### Types of Graph
159 | 
160 | This package currently two types of graphs: Ordered Graphs and Non Ordered Graphs.
161 | There currently no _in data_ difference between these graphs (ie. a given graph table can correspond to both types), but some macros are designed to be implemented on specific types of graph. For example, `connect_ordered_graph` is designed to only be used with ordered graphs.
162 | 
163 | #### Ordered Graphs
164 | 
165 | Ordered graphs have a natural order to their nodes. In this package, the ordering is typically implemented via edges - meaning that this package treats ordered graphs as an extension of induced ordered graphs. For more detail, see [wikipedia](https://en.wikipedia.org/wiki/Ordered_graph). Ordered graphs are typically interacted with via an `ordering` field - this corresponds to the order of the edges / nodes. Supported Ordering types are numeric and date(time).
166 | 
167 | An example of an ordered graph might be a rename path, or a customer's subscription history. These would both be ordered via a timestamp at which the rename happened / the customer swapped subscription type (respectively). 
168 | 
169 | #### Non Ordered Graphs
170 | 
171 | Non ordered graphs are simply graphs that do not fall into the above definition.
172 | 
173 | 
174 | ----
175 | ## Variables
176 | 
177 | This package currently has no variables that need to be configured.
178 | 
179 | ----
180 | ## Integration Tests (Developers Only)
181 | 
182 | This section assumes development on a mac, where python3 & postgresql are installed.
183 | 
184 | At the moment, postgres and duckdb integration tests are implemented.
185 | 
186 | ### Setting up python environment 
187 | 
188 | Integration tests for this repository are managed via the `integration_tests` folder.
189 | 
190 | To set up a python3 virtual environment, run the following in order from the `integration_tests` folder.
191 | 
192 | ```
193 | python3 -m venv ci_venv
194 | source ci_venv/bin/activate
195 | pip install -r requirements/requirements_<adapter>.txt
196 | ```
197 | 
198 | ie. to use postgres, run `pip install -r requirements/requirements_postgres.txt
199 | 
200 | To exit the virtual environment, simply run:
201 | 
202 | ```
203 | deactivate
204 | ```
205 | 
206 | By default, dbt runs against postgres on setup. You can use any of the adapters listed (the best way of seeing these is by the requirements files).
207 | Each adapter has a specific target in the ci_profiles.yml. You can specifc these with the `--target` flag in dbt - ie. `dbt run --profiles-dir ci_profiles --target ci_duckdb` will run dbt with duckdb.
208 | 
209 | The easiest way to follow along with this is with the CI file specified in `.github/workflows/ci.yml`
210 | 
211 | ### Setting up postgres server 
212 | 
213 | Postgres integration tests run on a local postgres server. The below assumes postgres has been installed via homebrew.
214 | 
215 | A postgres server can be spun up with the following:
216 | 
217 | ```
218 | bash bin/setup-postgres    --sets up a postgres server for running integration tests locally
219 | ```
220 | 
221 | Note that both of these silence stdout (ie error / success messages) so you may experience unexpected behaviour. If so - please raise an issue on GitHub.
222 | 
223 | Once, a postgres server has been setup, it can be started and stopped with the following:
224 | 
225 | ```
226 | bash bin/start-postgres    --starts the postgres server for running integration tests locally
227 | bash bin/stop-postgres    --stops the postgres server for running integration tests locally
228 | ```
229 | 
230 | These can be useful when you want to persist data from previous runs of integration tests but not constantly run a postgres server.
231 | 
232 | Finally, the postgres server can be destroyed via:
233 | 
234 | ```
235 | bash bin/destroy-postgres    --destroys the postgres server for running integration tests locally
236 | ```
237 | 
238 | ### Running dbt
239 | 
240 | To run dbt, simply run dbt commands as usual, specifying the CI profile & selecting the integration tests:
241 | 
242 | ```
243 | dbt clean
244 | dbt deps
245 | dbt seed --profiles-dir ci_profiles
246 | dbt build -s dbt_graph_theory_integration_tests --profiles-dir ci_profiles
247 | ```
248 | 
249 | It is often easier to set a DBT_PROFILES_DIR environment variable to remove the need for the `--profiles-dir ci_profiles` part of the above:
250 | 
251 | ```bash
252 | export DBT_PROFILES_DIR=ci_profiles
253 | ```
254 | 
255 | Remember to undo this when you are finished with:
256 | 
257 | ```
258 | unset DBT_PROFILES_DIR
259 | ```
260 | 
261 | The style of integration test is raw data that is seeded and validated against by running a model and running tests that should pass when the expected result is present.
262 | 
263 | ### Viewing local data
264 | 
265 | To view data generated in the integration tests locally, simply connect to the database and query the given table.
266 | 
267 | The details change between adapters. For postgres:
268 | 
269 | ```
270 | psql ci_db
271 | select * from <table>;
272 | ...
273 | quit -- to end the server connection
274 | ```
275 | 
276 | All CI models are required to run and pass tests for a merge to be allowed.
277 | 
278 | ----
279 | ## Contents
280 | 
281 | **[Generic tests](#generic-tests)**
282 |   - [graph_is_connected](#graph_is_connected)
283 | 
284 | **[Macros](#macros)**
285 |   - [enforce_graph_structure](#enforce_graph_structure)
286 |   - [connect_ordered_graph](#connect_ordered_graph)
287 |   - [largest_connected_subgraphs](#largest_connected_subgraphs)
288 | 
289 | **[Helper Macros](#helper-macros)**
290 |   - [array_agg](#array_agg)
291 |   - [array_append](#array_append)
292 |   - [array_construct](#array_construct)
293 |   - [array_contains](#array_contains)
294 | 
295 | ## Generic Tests
296 | 
297 | ### [graph_is_connected](tests/generic/graph_is_connected.sql)
298 | 
299 | Arguments:
300 | - edge_id [text]: the name of the field for the edge_id column in the given table graph representation.
301 | - vertex_1 [text]: the name of the field for the vertex_1 column in the given table graph representation.
302 | - vertex_2 [text]: the name of the field for the vertex_2 column in the given table graph representation.
303 | - graph_id [Optional, text]: the name of the field for the graph_id column in the given table graph representation.
304 | 
305 | Usage:
306 | ```yaml
307 | models:
308 |   - name: <model_name>
309 |     tests:
310 |       - dbt_graph_theory.graph_is_connected:
311 |           graph_id: ...
312 |           edge_id: ...
313 |           vertex_1: ...
314 |           vertex_2: ...
315 | ```
316 | 
317 | Tests whether the given model (a table representation of a graph) is connected. A connected graph is defined as one where a path exists between any two nodes. As an example, the below graph is not connected:
318 | 
319 | ```mermaid
320 | flowchart
321 |   A---B
322 |   B---C
323 |   D---E
324 |   E---F
325 |   D---F
326 |   E---G
327 | ```
328 | 
329 | ## Macros
330 | 
331 | ### [enforce_graph_structure](macros/enforce_graph_structure.sql)
332 | Readme is TODO
333 | 
334 | ### [connect_ordered_graph](macros/connect_ordered_graph.sql)
335 | 
336 | Arguments:
337 | - input: the input node (inputted as `ref(...)` or `source(...)`) or CTE (inputted as a string)
338 | - edge_id [text]: the name of the field for the edge_id column in the given table graph representation.
339 | - vertex_1 [Any]: the name of the field for the vertex_1 column in the given table graph representation.
340 | - vertex_2 [Any]: the name of the field for the vertex_2 column in the given table graph representation.
341 | - ordering [Dict[text,text]]: the field (and data type) corresponding to the ordering of the given edges. For example, {'timestamp_field': 'timestamp'} corresponds to a field named `timestamp_field` of type `timestamp` being the ordering field. Data types must be one of: 'timestamp', 'date', 'numeric'.
342 | - graph_id [Optional, text]: the name of the field for the graph_id column in the given table graph representation.
343 | 
344 | **Usage:**
345 | ```sql
346 | with connect_subgraphs as (
347 |     {{ dbt_graph_theory.connect_ordered_graph(
348 |         input=ref('example_model'),
349 |         edge_id='edge_id_field_name',
350 |         vertex_1='vertex_1_field_name',
351 |         vertex_2='vertex_2_field_name',
352 |         ordering={'ordering_field': 'numeric'},
353 |         graph_id='graph_id_field_name'
354 |     ) }}
355 | )
356 | ...
357 | ```
358 | 
359 | ```sql
360 | ...
361 | with connect_subgraphs as (
362 |     {{ dbt_graph_theory.connect_ordered_graph(
363 |         input=ref('example_model'),
364 |         edge_id='edge_id_field_name',
365 |         vertex_1='vertex_1_field_name',
366 |         vertex_2='vertex_2_field_name',
367 |         ordering={'different_ordering_field': 'timestamp'}
368 |     }}
369 | )
370 | ...
371 | ```
372 | 
373 | This connects disconnected subgraphs (at the graph_id level) based on a ranking of orderings. This can only be applied on **ordered** graphs, meaning that an `ordering` must be provided via the `ordering` kwarg.
374 | 
375 | In the below graph (the text on arrows is the ordering - in this case, numeric):
376 | 
377 | ```mermaid
378 | flowchart
379 |   A--->|1|B
380 |   B--->|2|C
381 |   D--->|3|E
382 |   E--->|4|F
383 | ```
384 | 
385 | The following graph (in table representation) is returned:
386 | 
387 | ```mermaid
388 | flowchart
389 |   A--->|1|B
390 |   B--->|2|C
391 |   C--->|2.5|D
392 |   D--->|3|E
393 |   E--->|4|F
394 | ```
395 | 
396 | The table in this case will have a new row in the output: 
397 | 
398 | | graph_id |     edge_id     | vertex_2 | vertex_1 | ordering_field |
399 | |:--------:|:---------------:|:--------:|:--------:|:--------------:|
400 | |     1    | inserted_edge_1 |     C    |     D    |       2.5      |
401 | 
402 | The orderings of inserted edges are dependent on the data type in question.
403 | 
404 | Timestamps will be 1 second behind the later edge that is being connected to. For example:
405 | 
406 | ```mermaid
407 | flowchart
408 |   A--->|2021-01-01 10:25:15|B
409 |   B--->|2021-01-04 17:43:45|C
410 |   D--->|2021-01-05 14:02:05|E
411 | ```
412 | 
413 | would become
414 | 
415 | ```mermaid
416 | flowchart
417 |   A--->|2021-01-01 10:25:15|B
418 |   B--->|2021-01-04 17:43:45|C
419 |   C--->|2021-01-05 14:02:04|D
420 |   D--->|2021-01-05 14:02:05|E
421 | ```
422 | 
423 | Dates will be 1 day behind the later edge (if this does not conflict with the earlier edge, in which case the date is left equal):
424 | 
425 | ```mermaid
426 | flowchart
427 |   A--->|2021-01-01|B
428 |   B--->|2021-01-04|C
429 |   D--->|2021-01-05|E
430 | ```
431 | 
432 | would become
433 | 
434 | ```mermaid
435 | flowchart
436 |   A--->|2021-01-01|B
437 |   B--->|2021-01-04|C
438 |   C--->|2021-01-05|D
439 |   D--->|2021-01-05|E
440 | ```
441 | 
442 | and
443 | 
444 | ```mermaid
445 | flowchart
446 |   A--->|2021-01-01|B
447 |   B--->|2021-01-04|C
448 |   D--->|2021-01-07|E
449 | ```
450 | 
451 | would become
452 | 
453 | ```mermaid
454 | flowchart
455 |   A--->|2021-01-01|B
456 |   B--->|2021-01-04|C
457 |   C--->|2021-01-06|D
458 |   D--->|2021-01-07|E
459 | ```
460 | 
461 | Numerics go exactly in between the two nodes being connected, as demonstrated in the above example.
462 | 
463 | 
464 | ### [largest_connected_subgraphs](macros/largest_connected_subgraphs.sql)
465 | 
466 | Arguments:
467 | - input: the input node (inputted as `ref(...)` or `source(...)`) or CTE (inputted as a string)
468 | - edge_id [text]: the name of the field for the edge_id column in the given table graph representation.
469 | - vertex_1 [Any]: the name of the field for the vertex_1 column in the given table graph representation.
470 | - vertex_2 [Any]: the name of the field for the vertex_2 column in the given table graph representation.
471 | - graph_id [Optional, text]: the name of the field for the graph_id column in the given table graph representation.
472 | 
473 | **Usage:**
474 | ```sql
475 | with subgraphs as (
476 |     {{ dbt_graph_theory.largest_connected_subgraphs(
477 |         input=ref('example_model'),
478 |         edge_id='edge_id_field_name',
479 |         vertex_1='vertex_1_field_name',
480 |         vertex_2='vertex_2_field_name',
481 |         graph_id='graph_id_field_name'
482 |     ) }}
483 | )
484 | ...
485 | ```
486 | 
487 | ```sql
488 | ...
489 | subgraphs as (
490 |     {{ dbt_graph_theory.largest_connected_subgraphs(
491 |         input='example_cte',
492 |         edge_id='different_edge_id_field_name',
493 |         vertex_1='different_vertex_1_field_name',
494 |         vertex_2='different_vertex_2_field_name'
495 |     ) }}
496 | )
497 | ...
498 | ```
499 | 
500 | This macro groups vertices into the largest connected subgraph that they are a member of.
501 | 
502 | In the below graph:
503 | 
504 | ```mermaid
505 | flowchart
506 |   A---B
507 |   B---C
508 |   D---E
509 |   E---F
510 |   D---F
511 |   E---G
512 | ```
513 | 
514 | The following table is returned:
515 | 
516 | |  vertex  | subgraph_id |  subgraph_members  |
517 | |:--------:|:-----------:|:------------------:|
518 | |     A    |      1      |  ['A', 'B', 'C']   |
519 | |     B    |      1      |  ['A', 'B', 'C']   |
520 | |     C    |      2      |  ['A', 'B', 'C']   |
521 | |     D    |      2      |['D', 'E', 'F', 'G']|
522 | |     E    |      2      |['D', 'E', 'F', 'G']|
523 | |     F    |      2      |['D', 'E', 'F', 'G']|
524 | |     G    |      2      |['D', 'E', 'F', 'G']|
525 | 
526 | subgraph_id is designed to be unique to both the graph and subgraph level. When graph_id is defined, the output is also at a graph_id level.
527 | 
528 | ## Helper Macros
529 | Note that the below are designed for internal (ie. dbt-graph-theory) use only. Use them at your own risk!
530 | ### [array_agg](macros/utils/array_agg.sql)
531 | 
532 | Arguments:
533 | 
534 | - field [text]: the field to be aggregated into an array (inputted as `'field_name'`).
535 | - distinct [optional, bool]: whether the aggregation should only include distinct values (inputted as `true/false`). Defaults to `false`.
536 | - order_field [optional, text]: the field that the array elements should be ordered by (inputted as `'field_name'`). Defaults to `none`.
537 | - order [optional, text]: the ordering that the order_field should be ordered by (inputted as `'field_name'`). Defaults to `none`.
538 | 
539 | This is an adapter specific macro for aggregating a column into an array.
540 | 
541 | This macro excludes nulls.
542 | 
543 | **Usage:**
544 | ```sql
545 | select
546 |     date_month,
547 |     {{ dbt_graph_theory.array_agg(field='customer_id') }} as customer_array
548 | from {{ ref('model') }}
549 | group by date_month
550 | ```
551 | 
552 | ```sql
553 | select
554 |     date_month,
555 |     {{ dbt_graph_theory.array_agg(field='customer_id', distinct=true, order_field='num_orders') }} as customer_array
556 | from {{ ref('model') }}
557 | group by date_month
558 | ```
559 | 
560 | ### [array_append](macros/utils/array_append.sql)
561 | 
562 | Arguments:
563 | 
564 | - array [text]: the array field where new values should be appended into (inputted as `'field_name'`).
565 | - new_value [text]: the field (or value) to be appended to the existing array(inputted as `'field_name'` or `'value'`).
566 | 
567 | This is an adapter specific macro for appending a new value into an array.
568 | 
569 | **Usage:**
570 | ```sql
571 | select
572 |     {{ dbt_graph_theory.array_append('existing_array', 'new_field_to_append') }} as updated_existing_array
573 | from {{ ref('model') }}
574 | ```
575 | 
576 | ```sql
577 | select
578 |     {{ dbt_graph_theory.array_append('existing_array', "'a_hardcoded_string'") }} as updated_existing_array
579 | from {{ ref('model') }}
580 | ```
581 | 
582 | ### [array_construct](macros/utils/array_construct.sql)
583 | 
584 | Arguments:
585 | 
586 | - components [list[text]]: the jinja list which will be used (in order) for the array's construction.
587 | 
588 | This is an adapter specific macro for constructuring an array from a list of values.
589 | 
590 | **Usage:**
591 | ```sql
592 | {% set list = ['field_one', 'field_two', "'hardcoded_string'"] %}
593 | select
594 |     {{ dbt_graph_theory.array_construct(components=list) }} as constructed_array
595 | from {{ ref('model') }}
596 | ```
597 | 
598 | ```sql
599 | select
600 |     {{ dbt_graph_theory.array_construct(components=['1', 'a_field']) }} as constructed_array
601 | from {{ ref('model') }}
602 | ```
603 | 
604 | ### [array_contains](macros/utils/array_contains.sql)
605 | 
606 | Arguments:
607 | 
608 | - array [text]: the array field which will be checked for inclusion of the given value.
609 | - value [text]: the field (or hardcoded data) which is checked for in the given array.
610 | 
611 | This is an adapter specific macro to test whether a value is contained within an array.
612 | 
613 | **Usage:**
614 | ```sql
615 | select
616 |     {{ dbt_graph_theory.array_contains(array='array_field', value='other_value') }} as value_contained__bool
617 | from {{ ref('model') }}
618 | ```
619 | 
620 | **Note**: README structure inspired by dbt-lab's [dbt-utils](https://github.com/dbt-labs/dbt-utils) and calogica's [dbt-date](https://github.com/calogica/dbt-date) and [dbt-expectations](https://github.com/calogica/dbt-expectations).
621 | 
622 | ## Exceptions
623 | These macros are for internal package use only. They should help with raising consistend error messages.
624 | ### [adapter_missing_exception](macros/exceptions/adapter_missing_exception.sql)
625 | 
626 | Arguments: `none`
627 | 
628 | Raise an exception to highlight that the given adapter is not currently supported.
629 | 
630 | **Usage:**
631 | ```sql
632 | ...
633 | {{ dbt_graph_theory.adapter_missing_exception() }}
634 | ...
635 | ```
636 | 


--------------------------------------------------------------------------------