├── .github └── workflows │ └── extension_release.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md └── extension ├── .cargo └── config.toml ├── .gitignore ├── Cargo.toml ├── Trunk.toml ├── assets └── PG_AUTO_DW_LOGO.png ├── docs ├── readme.md └── sql_functions │ ├── go.md │ ├── health.md │ ├── readme.md │ ├── source_columns.md │ ├── source_exclude.md │ ├── source_include.md │ ├── source_tables.md │ └── update_context.md ├── pg_auto_dw.control └── src ├── bin └── pgrx_embed_pg_auto_dw.rs ├── controller ├── bgw_init.rs ├── bgw_source_objects.rs ├── bgw_transformer_client.rs ├── dv_builder.rs ├── dv_loader.rs └── mod.rs ├── lib.rs ├── model ├── dv_schema.rs ├── mod.rs ├── prompt_template.rs ├── queries.rs └── source_objects.rs └── utility ├── guc.rs ├── mod.rs ├── ollama_client.rs ├── openai_client.rs ├── setup.rs ├── sql └── info_tables.sql └── transformer_client.rs /.github/workflows/extension_release.yml: -------------------------------------------------------------------------------- 1 | name: Release pg_auto_dw 2 | 3 | defaults: 4 | run: 5 | shell: bash 6 | working-directory: ./extension 7 | 8 | on: 9 | pull_request: 10 | branches: 11 | - main 12 | paths-ignore: 13 | - "README.md" 14 | push: 15 | branches: 16 | - main 17 | paths-ignore: 18 | - "README.md" 19 | release: 20 | types: 21 | - created 22 | jobs: 23 | publish: 24 | if: github.event_name == 'release' 25 | name: trunk publish 26 | runs-on: ubuntu-latest 27 | strategy: 28 | matrix: 29 | pg-version: [14, 15, 16, 17] 30 | steps: 31 | - uses: actions/checkout@v2 32 | - name: Install Rust stable toolchain 33 | uses: actions-rs/toolchain@v1 34 | with: 35 | toolchain: stable 36 | - name: Install stoml and pg-trunk 37 | shell: bash 38 | run: | 39 | set -xe 40 | wget https://github.com/freshautomations/stoml/releases/download/v0.7.1/stoml_linux_amd64 &> /dev/null 41 | mv stoml_linux_amd64 stoml 42 | chmod +x stoml 43 | sudo mv stoml /usr/local/bin/ 44 | cargo install pg-trunk 45 | - name: trunk build 46 | working-directory: ./extension 47 | run: | 48 | ~/.cargo/bin/trunk build --pg-version ${{ matrix.pg-version }} 49 | - name: trunk publish 50 | working-directory: ./extension 51 | env: 52 | TRUNK_API_TOKEN: ${{ secrets.TRUNK_AUTH_TOKEN }} 53 | run: ~/.cargo/bin/trunk publish 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea/ 3 | /target 4 | *.iml 5 | **/*.rs.bk 6 | Cargo.lock 7 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to `pg_auto_dw` 2 | 3 | ## Releases 4 | 5 | `pg_auto_dw` follows [semantic versioning](semver.org) and is released to [pgt.dev](https://pgt.dev/extensions/pg_auto_dw). 6 | 7 | To release, follow these steps: 8 | 9 | 1. Create a PR updating the version in `Cargo.toml` and `Trunk.toml`. These two values must agree. 10 | 2. Merge the PR into the `main` branch. 11 | 3. [Create the release](https://github.com/tembo-io/pg_auto_dw/releases/new) 12 | 1. Use the tag format `vX.Y.Z` where `X.Y.Z` is the version number. e.g. `v0.1.0`. This version should be the same value as in `Cargo.toml` and `Trunk.toml`. 13 | 2. Click "Generated release notes" to auto-populate the release notes or fill in with your own content and notes. 14 | 3. 
Click "Publish release" 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The PostgreSQL License 2 | 3 | Copyright (c) 2024, Tembo 4 | 5 | Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies. 6 | 7 | IN NO EVENT SHALL TEMBO BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF TEMBO HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 8 | 9 | TEMBO SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND TEMBO HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pg_auto_dw 2 | 3 | 4 | 5 | [![Static Badge](https://img.shields.io/badge/%40tembo-community?logo=slack&label=slack)](https://join.slack.com/t/tembocommunity/shared_invite/zt-277pu7chi-NHtvHWvLhHwyK0Y5Y6vTPw) 6 | [![OSSRank](https://shields.io/endpoint?url=https://ossrank.com/shield/4020)](https://ossrank.com/p/4020) 7 | [![Warning: Under Active Development](https://img.shields.io/badge/Warning-Under_Active_Development-yellow)](https://github.com/tembo-io/pg_auto_dw) 8 | 9 | 10 | ## Overview 11 | 12 | `pg_auto_dw` is a [permissively-licensed open-source](LICENSE) Postgres Extension that automates the creation of a Postgres-based data warehouse, given one or more transactional Postgres database inputs. 13 | 14 | We aim to do this within a structured environment that incorporates best practices and harnesses the capabilities of Large Language Models (LLM) technologies. 15 | 16 | We are starting with automation to facilitate a data vault implementation for our data warehouse. This will be a rudimentary raw vault setup, but we hope it will lead to substantial downstream business models. 17 | 18 | 19 | ## Goals 20 | 21 | - Automate the DW Build 22 | - Automate DW Maintenance 23 | - Understand DW Health 24 | - Support Data Governance 25 | 26 | These capabilities will be delivered through a [small set of Postgres functions](extension/docs/sql_functions/readme.md). 27 | 28 | ## Walkthrough 29 | 30 | ### Setup 31 | 32 | 1. Install extension 33 | 34 | ```SQL 35 | DROP EXTENSION IF EXISTS pg_auto_dw CASCADE; 36 | CREATE EXTENSION pg_auto_dw; 37 | ``` 38 | 39 | > Installing this extension installs a couple source sample tables in the PUBLIC SCHEMA as well as the PG_CRYPTO extension. 40 | 41 | 1. Restart your Postgres instance. 42 | 43 | 1. Create a destination schema 44 | 45 | Choose a name for a schema for your data warehouse to be built into. 46 | 47 | ```SQL 48 | DROP SCHEMA IF EXISTS my_dw CASCADE; 49 | CREATE SCHEMA my_dw; 50 | ``` 51 | 52 | 1. Reload configuration 53 | 54 | ```SQL 55 | SELECT pg_reload_conf(); 56 | ``` 57 | 58 | 1. 
Confirm setup
 59 | 
 60 | ```SQL
 61 | SHOW pg_auto_dw.database_name;
 62 | SHOW pg_auto_dw.dw_schema;
 63 | ```
 64 | This should return `postgres` and the schema name you selected.
 65 | 
 66 | 1. Set your LLM and reload configuration
 67 | 
 68 | ```SQL
 69 | ALTER SYSTEM SET pg_auto_dw.model TO 'gpt-4o';
 70 | ALTER SYSTEM SET pg_auto_dw.transformer_server_type TO 'openai';
 71 | ALTER SYSTEM SET pg_auto_dw.transformer_server_url TO 'https://api.openai.com/v1/chat/completions';
 72 | ALTER SYSTEM SET pg_auto_dw.transformer_server_token TO 'xxx';
 73 | SELECT pg_reload_conf();
 74 | ```
 75 | 
 76 | ### Load sample data
 77 | 
 78 | ```SQL
 79 | DROP TABLE IF EXISTS public.seller;
 80 | CREATE TABLE public.seller (
 81 |     seller_id UUID PRIMARY KEY, -- Designating seller_id as the primary key
 82 |     city VARCHAR(255),
 83 |     state CHAR(2),
 84 |     zip_5 VARCHAR(10)
 85 | );
 86 | 
 87 | INSERT INTO public.seller (seller_id, city, state, zip_5) VALUES
 88 | ('9449f25aeaf531019b76999ea49a6949','rio de janeiro','RJ','21040'),
 89 | ('9bc484c87d79cd4874e05ca182658045','sao paulo','SP','02422'),
 90 | ('3442f8959a84dea7ee197c632cb2df15','campinas','SP','13023'),
 91 | ('d149de2f383552baea37a7198c2296ce','sao paulo','SP','04193'),
 92 | ('c747d5b92c7648417faea95d36d763e8','pacatuba','CE','61800'),
 93 | ('455f46ef09a9e45667e2981df84b5cc2','sorocaba','SP','18052'),
 94 | ('8ff38bc3969e67c36c48343a07090f66','sao paulo','SP','08260'),
 95 | ('50bf89f1349bc0409a268c3a49678009','jaci','SP','15155'),
 96 | ('323ce52b5b81df2cd804b017b7f09aa7','sao paulo','SP','03306'),
 97 | ('1284de4ae8aa26997e748c851557cf0e','laranjeiras do sul','SP','85301'),
 98 | ('f80edd2c5aaa505cc4b0a3b219abf4b8','sao paulo','SP','03431');
 99 | 
100 | DROP TABLE IF EXISTS public.orders;
101 | CREATE TABLE public.orders (
102 |     order_id UUID PRIMARY KEY,
103 |     seller_id UUID,
104 |     order_date timestamp,
105 |     order_amount NUMERIC(10,2)
106 | );
107 | 
108 | INSERT INTO public.orders (order_id, seller_id, order_date, order_amount) VALUES
109 | (gen_random_uuid(), '9449f25aeaf531019b76999ea49a6949', now(), 20.01),
110 | (gen_random_uuid(), '9449f25aeaf531019b76999ea49a6949', now(), 44.01),
111 | (gen_random_uuid(), '9bc484c87d79cd4874e05ca182658045', now(), 99.03);
112 | ```
113 | 
114 | ### Build Data Warehouse
115 | 
116 | 1. Set your sources
117 | 
118 | ```SQL
119 | SELECT auto_dw.source_include('public', 'seller');
120 | SELECT auto_dw.source_include('public', 'orders');
121 | ```
122 | 
123 | Postgres regex is used behind the scenes. To do an exact match on the `seller` table, use:
124 | 
125 | ```SQL
126 | SELECT auto_dw.source_include('public', '^seller$');
127 | ```
128 | 
129 | 1. Confirm the table columns are queued for processing
130 | 
131 | ```SQL
132 | SELECT * FROM auto_dw.source_column();
133 | ```
134 | 
135 | You should see a list of columns with status `Queued for Processing`.
136 | 
137 | 1. Go
138 | 
139 | ```SQL
140 | SELECT auto_dw.go();
141 | ```
142 | 
143 | ### Accessing your data
144 | 
145 | Here's an example materialized view that pulls the data together into a flat structure.
146 | 
147 | ```SQL
148 | CREATE MATERIALIZED VIEW my_mat_view AS
149 | SELECT
150 |     sat_orders.order_date,
151 |     sat_orders.order_amount,
152 |     sat_seller.city,
153 |     sat_seller.state,
154 |     sat_seller.zip_5
155 | FROM my_dw.link_order_seller
156 | LEFT JOIN my_dw.sat_orders ON link_order_seller.link_order_seller_hk = sat_orders.link_order_seller_hk
157 | LEFT JOIN my_dw.hub_seller ON link_order_seller.hub_seller_hk = hub_seller.hub_seller_hk
158 | LEFT JOIN my_dw.sat_seller ON hub_seller.hub_seller_hk = sat_seller.hub_seller_hk;
159 | ```
160 | 
161 | 
162 | ### Tips
163 | 
164 | If your field isn't being interpreted correctly, try adding a comment to the field; the LLM takes column comments into account.
165 | 
166 | ```SQL
167 | COMMENT ON COLUMN public.orders.seller_id IS 'is business key';
168 | ```
169 | 
170 | ## Setting up foreign data wrappers
171 | 
172 | The example above reads data from the same instance that it's writing to. Normally you'd want to isolate analytical workloads from transactional workloads.
173 | 
174 | You can use Postgres foreign data wrapper functionality to accomplish this.
175 | 
176 | ```SQL
177 | -- Enable the postgres_fdw extension
178 | CREATE EXTENSION postgres_fdw;
179 | 
180 | -- Inspect existing foreign servers
181 | SELECT * FROM pg_foreign_server; -- Run on the previously configured client system to inspect existing foreign servers.
182 | 
183 | -- Create a new foreign server
184 | CREATE SERVER foreign_server
185 |     FOREIGN DATA WRAPPER postgres_fdw
186 |     OPTIONS (host 'remote_server_ip', dbname 'foreign_db', port '5432');
187 | 
188 | -- Inspect existing user mappings (if applicable)
189 | SELECT * FROM pg_user_mappings; -- Run on the previously configured client system to view user mappings for foreign servers.
190 | 
191 | -- Create a user mapping for the foreign server
192 | CREATE USER MAPPING FOR local_user
193 |     SERVER foreign_server
194 |     OPTIONS (user 'foreign_user', password 'password');
195 | 
196 | -- Manually define a foreign table
197 | CREATE FOREIGN TABLE foreign_table_name (
198 |     column1 datatype, -- Replace with the column name and datatype in the local schema.
199 |     column2 datatype  -- Repeat for additional columns.
200 | )
201 | SERVER foreign_server
202 | OPTIONS (
203 |     schema_name 'public', -- Schema name of the source table in the foreign server.
204 |     table_name 'source_table' -- Table name in the foreign server.
205 | );
206 | 
207 | -- Automatically via schema
208 | -- Use this approach to bulk import tables, minimizing manual effort.
209 | IMPORT FOREIGN SCHEMA public -- Replace 'public' with the schema name in the foreign server.
210 |     FROM SERVER foreign_server -- Specify the name of the foreign server.
211 |     INTO local_schema; -- Replace 'local_schema' with the schema name in the client system.
212 | ```
213 | 
214 | ## Advanced Demo: Auto Data Governance
215 | 
216 | Sometimes it's best to get a little push-back when creating a data warehouse; that push-back supports appropriate data governance. In this instance, a table was not ready to deploy to the data warehouse because one of its columns may need to be considered sensitive and handled appropriately. Auto DW's engine understands that the attribute is useful for analysis, but it also recognizes that the attribute may need to be treated as sensitive. In this script the user will:
217 | 
218 | 1) **Identify a Skipped Table**
219 | 
220 | ```SQL
221 | /* Identify source tables that were skipped and not integrated into the data warehouse. */
222 | SELECT schema, "table", status, status_response
223 | FROM auto_dw.source_table()
224 | WHERE status_code = 'SKIP';
225 | ```
226 | 
227 | > **Note:** Running this code will show which table was skipped along with a high-level reason. You should see the following output in the status_response: “Source Table was skipped as column(s) need additional context. Please run the following SQL query for more information: SELECT schema, table, column, status, status_response FROM auto_dw.source_status_detail() WHERE schema = 'public' AND table = 'customers'.”
228 | 
229 | 2) **Identify the Root Cause**
230 | 
231 | ```SQL
232 | /* Identify the source table column that caused the problem, understand the issue, and find a potential solution. */
233 | SELECT schema, "table", "column", status, confidence_level, status_response
234 | FROM auto_dw.source_column()
235 | WHERE schema = 'public' AND "table" = 'customer';
236 | ```
237 | 
238 | > **Note:** Running this code will show which table column was skipped along with a reason in the status_response. You should see the following output: “Requires Attention: Column cannot be appropriately categorized as it may contain sensitive data. Specifically, if the zip is an extended zip it may be considered PII.”
239 | 
240 | 3) **Decide to Institute Some Data Governance Best Practices**
241 | 
242 | ```SQL
243 | /* Altering column length restricts the acceptance of extended ZIP codes. */
244 | ALTER TABLE customer ALTER COLUMN zip TYPE VARCHAR(5);
245 | ```
246 | 
247 | > **Note:** Here the choice was up to the user to make a change that facilitated LLM understanding of data sensitivity. In this case, limiting the type to VARCHAR(5) lets the LLM understand that this column will not contain sensitive information in the future.
248 | 
249 | ```mermaid
250 | flowchart LR
251 |     Start(("Start")) --> tbl["Identify a Skipped Table\nauto_dw.source_table()"]
252 |     tbl --> col["Identify the Root Cause\nauto_dw.source_column()"]
253 |     col --> DW[("Institute Data Governance\nBest Practices")]
254 |     DW --> Done(("Done"))
255 | ```
256 | 
257 | **Auto DW Process Flow:** The script above demonstrates that there are several approaches to successfully implementing a data warehouse when using this extension. Below is a BPMN diagram that illustrates these various paths.
258 | 259 | ```mermaid 260 | flowchart LR 261 | subgraph functions_informative["Informative Functions"] 262 | direction LR 263 | health["auto_dw.health()"] 264 | source_tables["auto_dw.source_tables()"] 265 | source_column["auto_dw.source_column()"] 266 | end 267 | subgraph functions_interactive["Interactive Functions"] 268 | direction LR 269 | source_clude["auto_dw.source_include(object_pattern)"] 270 | update_context["auto_dw.update_context(object, context)"] 271 | go["auto_dw.go(flag, status)"] 272 | end 273 | subgraph data_gov["Data Governance"] 274 | direction BT 275 | to_gov{"X"} --> gov["Issue\nGovernance"] 276 | end 277 | start(("Start")) --> command["Choose Command"] 278 | command --> split{"X"} 279 | split --> health & source_tables & source_column & source_clude & update_context & go --> join{"X"} 280 | join --> review["Review Results"] 281 | review --> data_gov --> more_auto{"More\nAutomations?"} 282 | more_auto --> |no| done(("Done")) 283 | more_auto --> |yes| start_again(("Restart")) 284 | ``` 285 | 286 | -------------------------------------------------------------------------------- /extension/.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.'cfg(target_os="macos")'] 2 | # Postgres symbols won't be available until runtime 3 | rustflags = ["-Clink-arg=-Wl,-undefined,dynamic_lookup"] 4 | -------------------------------------------------------------------------------- /extension/.gitignore: -------------------------------------------------------------------------------- 1 | /target -------------------------------------------------------------------------------- /extension/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pg_auto_dw" # Extension name 3 | version = "0.0.5" # Extension version (SemVer: MAJOR.MINOR.PATCH) 4 | edition = "2021" # Rust 2021 edition 5 | 6 | [lib] 7 | crate-type = ["cdylib", "lib"] 8 | 9 | [features] 10 | default = ["pg16"] 11 | pg14 = ["pgrx/pg14", "pgrx-tests/pg14" ] 12 | pg15 = ["pgrx/pg15", "pgrx-tests/pg15" ] 13 | pg16 = ["pgrx/pg16", "pgrx-tests/pg16" ] 14 | pg17 = ["pgrx/pg17", "pgrx-tests/pg17" ] 15 | pg_test = [] 16 | 17 | # Custom features 18 | experimental = [] 19 | 20 | [dependencies] 21 | pgrx = "0.12.9" 22 | serde = { version = "1.0", features = ["derive"] } 23 | serde_json = "1.0" 24 | reqwest = { version = "0.11", features = ["json"] } 25 | tokio = { version = "1", features = ["full"] } 26 | uuid = { version = "1.1", features = ["v4", "v5", "serde"] } 27 | chrono = { version = "0.4", features = ["serde"] } 28 | anyhow = "1.0" 29 | regex = "1.7" 30 | sha2 = "0.10" 31 | hex = "0.4" 32 | 33 | [dev-dependencies] 34 | pgrx-tests = "0.12.9" 35 | 36 | [profile.dev] 37 | panic = "unwind" 38 | 39 | [profile.release] 40 | panic = "unwind" 41 | opt-level = 3 42 | lto = "fat" 43 | codegen-units = 1 44 | -------------------------------------------------------------------------------- /extension/Trunk.toml: -------------------------------------------------------------------------------- 1 | [extension] 2 | name = "pg_auto_dw" 3 | version = "0.0.5" 4 | repository = "https://github.com/tembo-io/pg_auto_dw" 5 | license = "PostgreSQL" 6 | description = "An auto data warehouse extension for Postgres." 
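# NOTE: This version must match the version in Cargo.toml (CONTRIBUTING.md requires the two values to agree).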
 7 | homepage = "https://github.com/tembo-io/pg_auto_dw"
 8 | documentation = "https://github.com/tembo-io/pg_auto_dw"
 9 | categories = ["analytics", "orchestration"]
10 | loadable_libraries = [{ library_name = "pg_auto_dw", requires_restart = true }]
11 | 
12 | [build]
13 | postgres_version = "15"
14 | platform = "linux/amd64"
--------------------------------------------------------------------------------
/extension/assets/PG_AUTO_DW_LOGO.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/tembo-io/pg_auto_dw/631e1946ebf8600f6459ddac16ddde58cfe0c646/extension/assets/PG_AUTO_DW_LOGO.png
--------------------------------------------------------------------------------
/extension/docs/readme.md:
--------------------------------------------------------------------------------
 1 | ## Documentation Hub
 2 | A Guide to PG_AUTO_DW
 3 | - [SQL Functions](sql_functions/readme.md)
--------------------------------------------------------------------------------
/extension/docs/sql_functions/go.md:
--------------------------------------------------------------------------------
 1 | ## Categories:
 2 | **[SQL Function - Interactive](readme.md#interactive-functions)**
 3 | 
 4 | # GO ![Status](https://img.shields.io/badge/status-draft-yellow)
 5 | Initiates data warehouse builds and dataflows.
 6 | 
 7 | ## Syntax
 8 | ``` SQL
 9 | go(flag, status)
10 | ```
11 | 
12 | ## Usage Notes
13 | Use this function to build an entire data warehouse or to push data from a single table into the built DW tables.
14 | 
15 | ## Examples
16 | 
17 | Build a Data Warehouse
18 | ```sql
19 | -- Builds a DW for all source tables that are ready-to-deploy.
20 | SELECT auto_dw.go('Build', 'RTD');
21 | ```
22 |     or
23 | ```sql
24 | -- Builds a DW for all source tables that are ready-to-deploy.
25 | SELECT auto_dw.go(); -- Runs the default.
26 | ```
27 |
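Before running a build, you can check which source tables are ready-to-deploy. A minimal sketch, assuming `source_tables()` exposes the same `schema`, `table`, and `status` columns that the repository's Advanced Demo selects from `auto_dw.source_table()`:

```sql
-- Review source-table status ahead of a build (illustrative).
SELECT schema, "table", status
FROM auto_dw.source_tables();
```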
28 | 29 | Perform a Dry Run 30 | ```sql 31 | -- Build, Test, and Rollback DW automation for all source tables that are ready-to-deploy. 32 | SELECT auto_dw.go('DryRun', 'RTD'); 33 | ``` 34 | 35 |
36 | 37 | Push data from a table. 38 | ```sql 39 | -- Push Source TABLE MARKETING.PROSPECTS data to the DW. 40 | SELECT auto_dw.go('Push-Table', 'marketing.prospects'); 41 | ``` 42 | -------------------------------------------------------------------------------- /extension/docs/sql_functions/health.md: -------------------------------------------------------------------------------- 1 | ## Categories: 2 | **[SQL Function - Informative](readme.md#informative-functions)** 3 | 4 | # HEALTH
![Status](https://img.shields.io/badge/status-draft-yellow)
 5 | 
 6 | Returns a table indicating the health of all DW automations.
 7 | 
 8 | ## Syntax
 9 | ```sql
10 | health()
11 | ```
12 | 
13 | ## Usage Notes
14 | Use this function often to understand the state of your data warehouse. Results can be used to identify operational errors and to check data availability.
15 | 
16 | ## Examples
17 | ```sql
18 | SELECT * FROM auto_dw.health();
19 | ```
20 | 
--------------------------------------------------------------------------------
/extension/docs/sql_functions/readme.md:
--------------------------------------------------------------------------------
 1 | ## SQL Function Documentation
 2 | ![Status](https://img.shields.io/badge/status-draft-yellow)
 3 | 
 4 | The following SQL functions provide the primary modality for interacting with the extension PG_AUTO_DW. Functions are broken into two categories: informative and interactive. Interactive functions can change the data warehouse (DW).
 5 | 
 6 | ### Informative Functions
 7 | These functions do not affect the database.
 8 | | Availability | Function | Purpose |
 9 | |--------------|---------------------------------------|-----------------------------------------------------------------------|
10 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`health()`](health.md) | Understand DW health. |
11 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`source_tables()`](source_tables.md) | Understand the status of all tables included for DW automation. |
12 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`source_columns()`](source_columns.md) | Understand the status of all table columns included for DW automation. |
13 | 
14 | ### Interactive Functions
15 | These functions can only affect the data warehouse portion of the database.
16 | | Availability | Function | Purpose |
17 | |--------------|---------------------------------------|-----------------------------------------------------------------------|
18 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`source_include(object_pattern)`](source_include.md) | Add source objects for DW automation. |
19 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`source_exclude(object_pattern)`](source_exclude.md) | Remove source objects from DW automation. |
20 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`update_context(object, context)`](update_context.md) | Provide information to facilitate DW automation. |
21 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`go(flag, status)`](go.md) | Initiates DW builds and dataflows. |
--------------------------------------------------------------------------------
/extension/docs/sql_functions/source_columns.md:
--------------------------------------------------------------------------------
 1 | ## Categories:
 2 | **[SQL Function - Informative](readme.md#informative-functions)**
 3 | 
 4 | # SOURCE_COLUMNS
![Status](https://img.shields.io/badge/status-draft-yellow)
 5 | 
 6 | Returns a table indicating the status of all columns included for DW automation.
 7 | 
 8 | ## Syntax
 9 | ```sql
10 | source_columns()
11 | ```
12 | 
13 | ## Usage Notes
14 | Use this function to see the status of source columns in the DW automation process. Results can be used to identify table columns that require additional attention.
15 | 
16 | ## Examples
17 | ```sql
18 | SELECT * FROM auto_dw.source_columns();
19 | ```
20 | 
21 | 
--------------------------------------------------------------------------------
/extension/docs/sql_functions/source_exclude.md:
--------------------------------------------------------------------------------
 1 | ## Categories:
 2 | **[SQL Function - Interactive](readme.md#interactive-functions)**
 3 | 
 4 | # SOURCE_EXCLUDE
![Status](https://img.shields.io/badge/status-draft-yellow) 5 | 6 | - Removes objects from the DW automation queue. 7 | - Returns a table indicating objects that have been removed from the DW automation queue.
 8 | 
 9 | ## Syntax
10 | ``` SQL
11 | source_exclude(object_pattern)
12 | ```
13 | 
14 | ## Usage Notes
15 | Use this function to remove SCHEMAS, TABLES, and COLUMNS from the DW automation queue.
16 | 
17 | ## Examples
18 | 
19 | All objects in the PUBLIC SCHEMA have been added by default. To remove SCHEMA PUBLIC, issue the following statement.
20 | ```sql
21 | -- Remove PUBLIC SCHEMA and associated objects from the queue.
22 | SELECT * FROM auto_dw.source_exclude('PUBLIC');
23 | ```
24 |
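A single table can also be excluded. A sketch, assuming source_exclude() accepts the same `schema.table.*` patterns shown in the source_include() examples:

```sql
-- Remove TABLE MARKETING.PROSPECTS and all of its columns from the queue (illustrative).
SELECT * FROM auto_dw.source_exclude('marketing.prospects.*');
```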
25 | 
26 | Remove COLUMN from TABLE MARKETING.PROSPECTS
27 | ```sql
28 | -- Remove attribute LAST_REACHED_TS
29 | SELECT * FROM auto_dw.source_exclude('marketing.prospects.last_reached_ts');
30 | ```
31 | **Note:** If automations have already warehoused this column, they will not remove the associated DW column or its data.
32 | 
--------------------------------------------------------------------------------
/extension/docs/sql_functions/source_include.md:
--------------------------------------------------------------------------------
 1 | ## Categories:
 2 | **[SQL Function - Interactive](readme.md#interactive-functions)**
 3 | 
 4 | # SOURCE_INCLUDE
![Status](https://img.shields.io/badge/status-draft-yellow) 5 | 6 | - Adds objects to the DW automation queue. 7 | - Returns a table indicating objects that have been added to the DW automation queue.
 8 | 
 9 | ## Syntax
10 | ``` SQL
11 | source_include(object_pattern)
12 | ```
13 | 
14 | ## Usage Notes
15 | Use this function to add SCHEMAS, TABLES, and COLUMNS to the DW automation queue. If new attributes have been added to a table, you may add them to the queue with this function.
16 | 
17 | > **Note:** All objects in the PUBLIC schema are added by default upon extension creation. To remove them, see the example in the source_exclude() function.
18 | 
19 | ## Examples
20 | 
21 | Add TABLE ERROR_LOGS
22 | ```sql
23 | -- Adds all TABLE ERROR_LOGS COLUMNS to the queue.
24 | SELECT * FROM auto_dw.source_include('logging.error_logs.*');
25 | ```
26 | 
27 | Add SCHEMA MARKETING
28 | ```sql
29 | -- Adds all TABLES and TABLE COLUMNS from SCHEMA MARKETING.
30 | SELECT * FROM auto_dw.source_include('marketing.*.*');
31 | ```
32 | 
33 | Add new COLUMN from TABLE MARKETING.PROSPECTS
34 | ```sql
35 | -- Add attribute LAST_REACHED_TS
36 | SELECT * FROM auto_dw.source_include('marketing.prospects.last_reached_ts');
37 | ```
38 | 
39 | 
--------------------------------------------------------------------------------
/extension/docs/sql_functions/source_tables.md:
--------------------------------------------------------------------------------
 1 | ## Categories:
 2 | **[SQL Function - Informative](readme.md#informative-functions)**
 3 | 
 4 | # SOURCE_TABLES
![Status](https://img.shields.io/badge/status-draft-yellow)
 5 | 
 6 | Returns a table indicating the status of all tables included for DW automation.
 7 | 
 8 | ## Syntax
 9 | ```sql
10 | source_tables()
11 | ```
12 | 
13 | ## Usage Notes
14 | Use this function to see the status of source tables in the DW automation process. Results can be used to identify tables that require additional attention or to understand the DW build status.
15 | 
16 | ## Examples
17 | ```sql
18 | SELECT * FROM auto_dw.source_tables();
19 | ```
20 | 
--------------------------------------------------------------------------------
/extension/docs/sql_functions/update_context.md:
--------------------------------------------------------------------------------
 1 | ## Categories:
 2 | **[SQL Function - Interactive](readme.md#interactive-functions)**
 3 | 
 4 | # UPDATE_CONTEXT
![Status](https://img.shields.io/badge/status-draft-yellow)
 5 | Adds context to objects for DW automation processes.
 6 | 
 7 | ## Syntax
 8 | ``` SQL
 9 | update_context(object, context)
10 | ```
11 | 
12 | ## Usage Notes
13 | Use this function to add context to SCHEMAS, TABLES, and COLUMNS.
14 | 
15 | ## Examples
16 | 
17 | Adding a 4 AM Daily Schedule to TABLE ERROR_LOGS
18 | ```sql
19 | -- Schedule TABLE ERROR_LOGS for daily processing at 4 AM.
20 | SELECT auto_dw.update_context('logging.error_logs', '{"cron": "0 4 * * *"}');
21 | ```
22 | 
23 |
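After updating context, the informative functions can confirm that the object's status reflects the new context. A sketch reusing the source_column() call from the repository's Advanced Demo, pointed at the table from the example above:

```sql
-- Re-check column status after providing context (illustrative).
SELECT schema, "table", "column", status, status_response
FROM auto_dw.source_column()
WHERE schema = 'logging' AND "table" = 'error_logs';
```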
24 | 
25 | Indicate that COLUMN ZIP does not contain sensitive information.
26 | ```sql
27 | SELECT auto_dw.update_context('PUBLIC.CUSTOMER.ZIP', '{"sensitive": false}');
28 | ```
29 | 
30 | 
--------------------------------------------------------------------------------
/extension/pg_auto_dw.control:
--------------------------------------------------------------------------------
 1 | comment = 'Extension to automatically create downstream data warehouse tables.'
 2 | default_version = '@CARGO_VERSION@'
 3 | module_pathname = '$libdir/pg_auto_dw'
 4 | relocatable = false
 5 | superuser = true
 6 | schema = 'auto_dw'
--------------------------------------------------------------------------------
/extension/src/bin/pgrx_embed_pg_auto_dw.rs:
--------------------------------------------------------------------------------
 1 | ::pgrx::pgrx_embed!();
--------------------------------------------------------------------------------
/extension/src/controller/bgw_init.rs:
--------------------------------------------------------------------------------
 1 | use pgrx::bgworkers::*;
 2 | use pgrx::prelude::*;
 3 | 
 4 | use crate::utility::guc;
 5 | 
 6 | #[pg_guard]
 7 | pub extern "C" fn _PG_init() {
 8 | 
 9 |     guc::init_guc();
10 | 
11 |     let database_name_o = guc::get_guc(guc::PgAutoDWGuc::DatabaseName);
12 | 
13 |     match database_name_o {
14 |         Some(_database_name) => {
15 | 
16 |             BackgroundWorkerBuilder::new("Background Worker Source Object Update")
17 |                 .set_function("background_worker_source_objects")
18 |                 .set_library("pg_auto_dw")
19 |                 .enable_spi_access()
20 |                 .load();
21 | 
22 |             BackgroundWorkerBuilder::new("Background Worker Transformer Client")
23 |                 .set_function("background_worker_transformer_client")
24 |                 .set_library("pg_auto_dw")
25 |                 .enable_spi_access()
26 |                 .load();
27 |         }
28 |         None => {
29 |             log!("Database Name for this extension has not been set.");
30 |         }
31 |     }
32 | }
33 | 
34 | 
35 | 
36 | 
37 | 
38 | 
--------------------------------------------------------------------------------
/extension/src/controller/bgw_source_objects.rs:
--------------------------------------------------------------------------------
 1 | use pgrx::bgworkers::*;
 2 | use pgrx::prelude::*;
 3 | 
 4 | use std::time::Duration;
 5 | 
 6 | use crate::queries;
 7 | use crate::utility::guc;
 8 | 
 9 | #[pg_guard]
10 | #[no_mangle]
11 | pub extern "C" fn background_worker_source_objects(_arg: pg_sys::Datum) {
12 | 
13 |     let optional_database_name = guc::get_guc(guc::PgAutoDWGuc::DatabaseName);
14 | 
15 |     BackgroundWorker::attach_signal_handlers(SignalWakeFlags::SIGHUP | SignalWakeFlags::SIGTERM);
16 |     BackgroundWorker::connect_worker_to_spi(optional_database_name.as_deref(), None);
17 | 
18 |     while BackgroundWorker::wait_latch(Some(Duration::from_secs(10))) {
19 |         let result: Result<(), pgrx::spi::Error> = BackgroundWorker::transaction(|| {
20 |             Spi::connect(|mut client| {
21 | 
22 |                 let table_check_results: Result<SpiTupleTable, pgrx::spi::Error> =
23 |                     client.select("SELECT table_name FROM information_schema.tables WHERE table_schema = 'auto_dw' AND table_name = 'source_objects'", None, None);
24 |                 match table_check_results {
25 |                     Ok(table_check) => {
26 |                         if table_check.len() > 0 {
27 |                             client.update(
28 |                                 queries::source_object_dw(
29 |                                     "a^",
30 |                                     "a^",
31 |                                     "a^",
32 |                                     "a^",
33 |                                     "a^",
34 |                                     "a^"
35 |                                 ).as_str(),
36 |                                 None,
37 |                                 None,
38 |                             )?;
39 |                         } else {
40 |                             panic!("TABLE AUTO_DW.SOURCE_OBJECTS not found. 
PG_AUTO_DW Extension may need to be installed."); 41 | } 42 | }, 43 | Err(e) => { 44 | log!("Error checking TABLE AUTO_DW.SOURCE_OJBECTS: {:?}", e); 45 | } 46 | } 47 | Ok(()) 48 | }) 49 | }); 50 | result.unwrap_or_else(|e| panic!("got an error: {}", e)); 51 | } 52 | } -------------------------------------------------------------------------------- /extension/src/controller/bgw_transformer_client.rs: -------------------------------------------------------------------------------- 1 | use pgrx::bgworkers::*; 2 | use pgrx::{prelude::*, pg_sys::Oid}; 3 | 4 | use std::time::Duration; 5 | use std::collections::HashMap; 6 | use tokio::runtime::Runtime; 7 | use tokio::time::sleep; 8 | use serde::Deserialize; 9 | 10 | use crate::model::*; 11 | use crate::utility::transformer_client; 12 | use crate::utility::guc; 13 | use regex::Regex; 14 | 15 | #[pg_guard] 16 | #[no_mangle] 17 | pub extern "C" fn background_worker_transformer_client(_arg: pg_sys::Datum) { 18 | 19 | let max_transformer_retries = guc::get_guc(guc::PgAutoDWGuc::TransformerServerMaxRetries).unwrap(); 20 | let max_transformer_retries: u64 = max_transformer_retries.parse().expect("TransformerServerMaxRetries Not Valid u64"); 21 | 22 | let database_name_string = guc::get_guc(guc::PgAutoDWGuc::DatabaseName); 23 | let database_name_o: Option<&str> = database_name_string.as_deref(); 24 | 25 | BackgroundWorker::attach_signal_handlers(SignalWakeFlags::SIGHUP | SignalWakeFlags::SIGTERM); 26 | BackgroundWorker::connect_worker_to_spi(database_name_o, None); 27 | 28 | // Initialize Tokio runtime 29 | let runtime = Runtime::new().expect("Failed to create Tokio runtime"); 30 | 31 | while BackgroundWorker::wait_latch(Some(Duration::from_secs(10))) { 32 | 33 | extension_log("BGWorker: Transformer Client", "INFO", "Beginning Transformer Background Process."); 34 | 35 | // Load Prompts into Results 36 | let result: Result, pgrx::spi::Error> = BackgroundWorker::transaction(|| { 37 | Spi::connect(|client| { 38 | let source_objects_json = client.select(queries::SOURCE_OBJECTS_JSON, None, None)?; 39 | let mut v_source_table_prompts: Vec = Vec::new(); 40 | for source_object_json in source_objects_json { 41 | 42 | let table_oid = source_object_json.get_datum_by_ordinal(1)?.value::()?.unwrap(); 43 | let table_column_links = source_object_json.get_datum_by_ordinal(2)?.value::()?.unwrap(); 44 | let table_details = source_object_json.get_datum_by_ordinal(3)?.value::()?.unwrap(); 45 | 46 | let source_table_prompt = source_objects::SourceTablePrompt{ 47 | key: table_oid, 48 | table_column_links: table_column_links, 49 | table_details: table_details 50 | }; 51 | v_source_table_prompts.push(source_table_prompt) 52 | } 53 | Ok(v_source_table_prompts) 54 | }) 55 | }); 56 | 57 | // Get Prompts for Processing 58 | let v_source_table_prompts = result.unwrap_or_else(|e| panic!("got an error: {}", e)); 59 | 60 | // Process Each Prompt 61 | for source_table_prompt in v_source_table_prompts { 62 | log!("Starting Loop for Table Processing."); 63 | let table_details_json_str = serde_json::to_string_pretty(&source_table_prompt.table_details).expect("Failed to convert JSON Table Details to pretty string"); 64 | 65 | let table_column_link_json_str = serde_json::to_string_pretty(&source_table_prompt.table_column_links).expect("Failed to convert JSON Column Links to pretty string"); 66 | let table_column_links_o: Option = serde_json::from_str(&table_column_link_json_str).ok(); 67 | 68 | let columns = extract_column_numbers(&table_details_json_str); 69 | 70 | // Table Business Key 
Component Identification 71 | let mut generation_json_business_key_component_identification: Option = None; 72 | let mut generation_json_business_key_name: Option = None; 73 | let mut business_key_component_identification: HashMap<&u32, BusinessKeyComponentIdentification> = HashMap::new(); 74 | let mut business_key_name: HashMap<&u32, BusinessKeyName> = HashMap::new(); 75 | 76 | // Evaluate Attributes 77 | for column in &columns { 78 | let mut retries = 0; 79 | let mut hints = String::new(); 80 | 81 | while retries < max_transformer_retries { 82 | runtime.block_on(async { 83 | generation_json_business_key_component_identification = 84 | match transformer_client::send_request( 85 | table_details_json_str.as_str(), 86 | prompt_template::PromptTemplate::BKComponentIdentification, 87 | column, 88 | &hints).await { 89 | Ok(response_json) => { 90 | Some(response_json) 91 | }, 92 | Err(e) => { 93 | log!("Error in transformer request, BKComponentIdentification, malformed or timed out: {}", e); 94 | hints = format!("Hint: Please ensure you provide a JSON response only. This is your {} attempt and in that attept the following error is was given {e}.", retries + 1); 95 | 96 | log!("Delaying {retries} seconds for retry #{retries}."); 97 | sleep(Duration::from_secs(retries)).await; 98 | 99 | None 100 | } 101 | }; 102 | }); 103 | 104 | if generation_json_business_key_component_identification.is_none() { 105 | retries += 1; 106 | 107 | if retries >= max_transformer_retries { 108 | panic!("Max Transformer Retries Reached - restart backgrounder.") 109 | } 110 | 111 | // Skip to the next iteration 112 | continue; 113 | } 114 | 115 | match serde_json::from_value::(generation_json_business_key_component_identification.clone().unwrap()) { 116 | Ok(bki) => { 117 | business_key_component_identification.insert(column, bki); 118 | break; // Successfully Decoded 119 | } 120 | Err(e) => { 121 | log!("Error JSON JSON Structure not of type DescriptorSensitive: {}", e); 122 | } 123 | } 124 | retries += 1; 125 | log!("Transformer Retry No: {retries}"); 126 | } 127 | } 128 | 129 | // Generate Name if Identified as BK 130 | for column in &columns { 131 | let mut retries = 0; 132 | let mut hints = String::new(); 133 | 134 | match business_key_component_identification.get(column) { 135 | Some(bkci) => { 136 | if bkci.business_key_component_identification.is_business_key_component { 137 | // Identify BK Name 138 | while retries < max_transformer_retries { 139 | runtime.block_on(async { 140 | generation_json_business_key_name = 141 | match transformer_client::send_request(table_details_json_str.as_str(), prompt_template::PromptTemplate::BKName, &column, &hints).await { 142 | Ok(response_json) => { 143 | Some(response_json) 144 | }, 145 | Err(e) => { 146 | log!("Error in transformer request, BKName, malformed or timed out: {}", e); 147 | hints = format!("Hint: Please ensure you provide a JSON response only. 
This is your {} attempt and in that attept the following error is was given {e}.", retries + 1); 148 | 149 | log!("Delaying {retries} seconds for retry #{retries}."); 150 | sleep(Duration::from_secs(retries)).await; 151 | 152 | None 153 | } 154 | }; 155 | }); 156 | 157 | if generation_json_business_key_name.is_none() { 158 | retries += 1; 159 | 160 | if retries >= max_transformer_retries { 161 | panic!("Max Transformer Retries Reached - restart backgrounder.") 162 | } 163 | 164 | // Skip to the next iteration 165 | continue; 166 | } 167 | 168 | match serde_json::from_value::(generation_json_business_key_name.clone().unwrap()) { 169 | Ok(bkn) => { 170 | business_key_name.insert(column, bkn); 171 | break; // Successfully Decoded 172 | } 173 | Err(e) => { 174 | log!("Error JSON JSON Structure not of type BusinessKeyName: {}", e); 175 | } 176 | } 177 | 178 | retries += 1; 179 | } 180 | } else { 181 | continue; // Go do next column 182 | } 183 | } 184 | None => panic!("All columns should have been checked for business keys. No BusinessKeyComponetIdentification Struct Found."), 185 | } 186 | } 187 | 188 | // Identity Descriptor - Sensitive 189 | // let mut generation_json_descriptors_sensitive: HashMap<&u32, Option> = HashMap::new(); 190 | let mut descriptors_sensitive: HashMap<&u32, DescriptorSensitive> = HashMap::new(); 191 | let mut generation_json_descriptor_sensitive: Option = None; 192 | for column in &columns { 193 | let mut retries = 0; 194 | let mut hints = String::new(); 195 | while retries < max_transformer_retries { 196 | // Run the async block 197 | runtime.block_on(async { 198 | // Get Generation 199 | generation_json_descriptor_sensitive = 200 | match transformer_client::send_request( 201 | table_details_json_str.as_str(), 202 | prompt_template::PromptTemplate::DescriptorSensitive, 203 | column, 204 | &hints).await { 205 | Ok(response_json) => { 206 | Some(response_json) 207 | }, 208 | Err(e) => { 209 | log!("Error in transformer request, DescriptorSensitive, malformed or timed out: {}", e); 210 | hints = format!("Hint: Please ensure you provide a JSON response only. 
This is your {} attempt and in that attept the following error is was given {e}.", retries + 1); 211 | 212 | log!("Delaying {retries} seconds for retry #{retries}."); 213 | sleep(Duration::from_secs(retries)).await; 214 | 215 | None 216 | } 217 | }; 218 | // generation_json_descriptors_sensitive.insert(column, generation_json_descriptor_sensitive); 219 | }); 220 | 221 | if generation_json_descriptor_sensitive.is_none() { 222 | retries += 1; 223 | 224 | if retries >= max_transformer_retries { 225 | panic!("Max Transformer Retries Reached - restart backgrounder.") 226 | } 227 | 228 | // Skip to the next iteration 229 | continue; 230 | } 231 | 232 | match serde_json::from_value::(generation_json_descriptor_sensitive.clone().unwrap()) { 233 | Ok(des) => { 234 | // business_key_name_opt = Some(des); 235 | descriptors_sensitive.insert(column, des); 236 | break; // Successfully Decoded 237 | } 238 | Err(e) => { 239 | log!("Error JSON JSON Structure not of type DescriptorSensitive: {}", e); 240 | } 241 | } 242 | 243 | retries += 1; 244 | } 245 | } 246 | 247 | let table_column_links = table_column_links_o.unwrap(); 248 | 249 | // Build the SQL INSERT statement 250 | let mut insert_sql = String::from("INSERT INTO auto_dw.transformer_responses (fk_source_objects, model_name, category, business_key_name, confidence_score, reason) VALUES "); 251 | 252 | for (index, column) in columns.iter().enumerate() { 253 | 254 | let last = {index == table_column_links.column_links.len() - 1}; 255 | 256 | match (business_key_component_identification.get(column), business_key_name.get(column)) { 257 | (Some(business_key_component_identification), Some(business_key_name)) => { 258 | let category = "Business Key Part"; 259 | // Calculate the overall confidence score by taking the minimum of the confidence values 260 | // for the identified business key and the business key name. This approach is chosen to 261 | // ensure that the overall confidence reflects the weakest link, avoiding inflation of 262 | // the confidence score when one value is significantly lower than the other. 
263 | let confidence_score = 264 | business_key_component_identification.business_key_component_identification.confidence_value.min( 265 | business_key_name.business_key_name_values.confidence_value); 266 | let bk_name = &business_key_name.business_key_name_values.name; 267 | let bk_identified_reason = &business_key_component_identification.business_key_component_identification.reason; 268 | let bk_name_reason = &business_key_name.business_key_name_values.reason; 269 | let reason = format!("BK Identified Reason: {}, BK Naming Reason: {}", bk_identified_reason, bk_name_reason); 270 | let model_name_owned = guc::get_guc(guc::PgAutoDWGuc::Model).expect("MODEL GUC is not set."); 271 | let model_name = model_name_owned.as_str(); 272 | 273 | let pk_source_objects: i32; 274 | 275 | if let Some(pk_source_objects_temp) = table_column_links.find_pk_source_objects(column.clone() as i32) { 276 | pk_source_objects = pk_source_objects_temp; 277 | } else { 278 | println!("No match found for column_ordinal_position: {}", column); 279 | panic!() 280 | } 281 | 282 | if !last { 283 | insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}'),", pk_source_objects, model_name, category, bk_name.replace(" ", "_"), confidence_score, reason.replace("'", "''"))); 284 | } else { 285 | insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}');", pk_source_objects, model_name, category, bk_name.replace(" ", "_"), confidence_score, reason.replace("'", "''"))); 286 | } 287 | 288 | } 289 | _ => { // Not Identified as BKs 290 | let pk_source_objects: i32; 291 | let mut category = "Descriptor"; 292 | let mut confidence_score: f64 = 1.0; 293 | let bk_name = "NA"; 294 | let mut reason = "Defaulted of category 'Descriptor' maintained.".to_string(); 295 | let model_name_owned = guc::get_guc(guc::PgAutoDWGuc::Model).expect("MODEL GUC is not set."); 296 | let model_name = model_name_owned.as_str(); 297 | 298 | if let Some(pk_source_objects_temp) = table_column_links.find_pk_source_objects(column.clone() as i32) { 299 | pk_source_objects = pk_source_objects_temp; 300 | } else { 301 | println!("No match found for column_ordinal_position: {}", column); 302 | panic!() 303 | } 304 | 305 | if let Some(descriptor_sensitive) = descriptors_sensitive.get(&column) { 306 | if descriptor_sensitive.descriptor_sensitive_values.is_pii && (descriptor_sensitive.descriptor_sensitive_values.confidence_value > 0.5) { 307 | category = "Descriptor - Sensitive"; 308 | confidence_score = descriptor_sensitive.descriptor_sensitive_values.confidence_value; 309 | reason = descriptor_sensitive.descriptor_sensitive_values.reason.clone(); 310 | } 311 | } else { 312 | log!("Teseting Can't find a response for {} in Descriptors Sensitive Hashmap.", column); 313 | } 314 | 315 | if !last { 316 | insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}'),", pk_source_objects, model_name, category, bk_name.replace(" ", "_"), confidence_score, reason.replace("'", "''"))); 317 | } else { 318 | insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}');", pk_source_objects, model_name, category, bk_name.replace(" ", "_"), confidence_score, reason.replace("'", "''"))); 319 | } 320 | } 321 | } 322 | } 323 | 324 | // Push Generation to TABLE TRANSFORMER_RESPONSES 325 | BackgroundWorker::transaction(|| { 326 | Spi::connect(|mut client| { 327 | _ = client.update(insert_sql.as_str(), None, None); 328 | }) 329 | }); 330 | } 331 | 332 | } 333 | } 334 | 335 | fn extension_log(process: &str, level: &str, message: &str) { 336 | 337 | let insert_statement = 
format!(r#" 338 | INSERT INTO auto_dw.log (process, level, message) 339 | VALUES ('{}', '{}', '{}'); 340 | "#, process, level, message); 341 | 342 | BackgroundWorker::transaction(|| { 343 | Spi::connect(|mut client| { 344 | _ = client.update(insert_statement.as_str(), None, None); 345 | }) 346 | }); 347 | } 348 | 349 | fn extract_column_numbers(json_str: &str) -> Vec { 350 | // Define a regex to capture the column numbers 351 | let re = Regex::new(r"Column No: (\d+)").expect("Invalid regex"); 352 | 353 | // Find all matches and collect the column numbers 354 | re.captures_iter(json_str) 355 | .filter_map(|caps| caps.get(1).map(|m| m.as_str().parse::().unwrap())) 356 | .collect() 357 | } 358 | 359 | #[derive(Deserialize, Debug)] 360 | enum TableClassificationType { 361 | Hub, 362 | Link, 363 | } 364 | 365 | #[derive(Deserialize, Debug)] 366 | struct BusinessKeyComponentIdentification { 367 | #[serde(rename = "Business Key Component Identification")] 368 | business_key_component_identification: BusinessKeyComponentIdentificationValues, 369 | } 370 | 371 | #[derive(Deserialize, Debug)] 372 | struct BusinessKeyComponentIdentificationValues { 373 | #[serde(rename = "Is Business Key Component")] 374 | is_business_key_component: bool, 375 | #[serde(rename = "Confidence Value")] 376 | confidence_value: f64, 377 | #[serde(rename = "Reason")] 378 | reason: String, 379 | } 380 | 381 | #[derive(Deserialize, Debug)] 382 | struct BusinessKeyName { 383 | #[serde(rename = "Business Key Name")] 384 | business_key_name_values: BusinessKeyNameValues, 385 | } 386 | 387 | #[derive(Deserialize, Debug)] 388 | struct BusinessKeyNameValues { 389 | #[serde(rename = "Name")] 390 | name: String, 391 | #[serde(rename = "Confidence Value")] 392 | confidence_value: f64, 393 | #[serde(rename = "Reason")] 394 | reason: String, 395 | } 396 | 397 | #[derive(Deserialize, Debug)] 398 | struct DescriptorSensitive { 399 | #[serde(rename = "Descriptor - Sensitive")] 400 | descriptor_sensitive_values: DescriptorSensitiveValues, 401 | } 402 | 403 | #[derive(Deserialize, Debug)] 404 | struct DescriptorSensitiveValues { 405 | #[serde(rename = "Is PII")] 406 | is_pii: bool, 407 | #[serde(rename = "Confidence Value")] 408 | confidence_value: f64, 409 | #[serde(rename = "Reason")] 410 | reason: String, 411 | } 412 | 413 | -------------------------------------------------------------------------------- /extension/src/controller/dv_builder.rs: -------------------------------------------------------------------------------- 1 | use pgrx::{prelude::*, pg_sys::Oid}; 2 | use uuid::Uuid; 3 | use std::collections::HashMap; 4 | use chrono::Utc; 5 | 6 | use crate::model::queries; 7 | use crate::utility::guc; 8 | use crate::model::dv_schema::{ 9 | DVSchema, 10 | LinkKey, 11 | BusinessKey, 12 | BusinessKeyPartLink, 13 | Descriptor, 14 | DescriptorLink, 15 | ColumnData 16 | }; 17 | 18 | use super::dv_loader::*; 19 | 20 | pub fn build_dv(build_id: Uuid, dv_objects_query: &str, load_data: bool) { 21 | 22 | let mut dv_objects_hm: HashMap> = HashMap::new(); 23 | 24 | Spi::connect(|client| 25 | { 26 | let dv_objects_result = client.select(dv_objects_query, None, None); 27 | 28 | match dv_objects_result { 29 | 30 | Ok(dv_objects) => { 31 | 32 | for dv_object in dv_objects { 33 | 34 | let schema_name = dv_object.get_datum_by_ordinal(1).unwrap().value::().unwrap().unwrap(); 35 | let table_name = dv_object.get_datum_by_ordinal(2).unwrap().value::().unwrap().unwrap(); 36 | let column_category = 
dv_object.get_datum_by_ordinal(3).unwrap().value::().unwrap().unwrap(); 37 | let business_key_name = dv_object.get_datum_by_ordinal(4).unwrap().value::().unwrap().unwrap(); 38 | let column_name = dv_object.get_datum_by_ordinal(5).unwrap().value::().unwrap().unwrap(); 39 | let column_type_name = dv_object.get_datum_by_ordinal(6).unwrap().value::().unwrap().unwrap(); 40 | let system_id = dv_object.get_datum_by_ordinal(7).unwrap().value::().unwrap().unwrap(); 41 | let table_oid: Oid = dv_object.get_datum_by_ordinal(8).unwrap().value::().unwrap().unwrap(); 42 | let column_ordinal_position = dv_object.get_datum_by_ordinal(9).unwrap().value::().unwrap().unwrap(); 43 | 44 | let column_category = ColumnCategory::from_str(&column_category); 45 | 46 | let transformer_object: TransformerObject = 47 | TransformerObject { 48 | schema_name, 49 | table_name, 50 | business_key_name, 51 | column_name, 52 | column_type_name, 53 | system_id, 54 | table_oid, 55 | column_ordinal_position, 56 | column_category, 57 | }; 58 | 59 | // Bucket TransformerObject by table 60 | dv_objects_hm 61 | .entry(table_oid) 62 | .or_insert_with(Vec::new) 63 | .push(transformer_object); 64 | 65 | } 66 | } 67 | 68 | Err(e) => { 69 | log!("Error getting DV Transformer Objects Result: {:?}", e); 70 | } 71 | } 72 | } 73 | ); 74 | 75 | // Ensure ordering based on column ordinality for consistent processing. 76 | for dv_object in dv_objects_hm.values_mut() { 77 | dv_object.sort_by_key(|dv_object| dv_object.column_ordinal_position); 78 | } 79 | 80 | let ( 81 | dv_objects_hm_single_bkp, 82 | dv_objects_hm_multiple_bkp, 83 | ) = separate_by_business_parts(dv_objects_hm); 84 | 85 | // Build a Vector of LinkKeys 86 | let mut link_keys: Vec = Vec::new(); 87 | 88 | for dv_objects_v in dv_objects_hm_multiple_bkp { 89 | 90 | let mut descriptors: Vec = Vec::new(); 91 | 92 | // Build Descriptors 93 | for dv_object in &dv_objects_v.1 { 94 | 95 | let column_data_id = Uuid::new_v4(); 96 | 97 | let column_data = ColumnData { 98 | id: column_data_id, 99 | system_id: dv_object.system_id, 100 | schema_name: dv_object.schema_name.clone(), 101 | table_oid: dv_object.table_oid, 102 | table_name: dv_object.table_name.clone(), 103 | column_name: dv_object.column_name.clone(), 104 | column_ordinal_position: dv_object.column_ordinal_position, 105 | column_type_name: dv_object.column_type_name.clone(), 106 | }; 107 | let orbit = dv_object.table_name.clone(); 108 | 109 | if dv_object.column_category == ColumnCategory::Descriptor { 110 | let descriptor = get_descriptor(dv_object.column_name.clone(), column_data, orbit, false); 111 | descriptors.push(descriptor); 112 | } else if dv_object.column_category == ColumnCategory::DescriptorSensitive { 113 | let descriptor = get_descriptor(dv_object.column_name.clone(), column_data, orbit, true); 114 | descriptors.push(descriptor); 115 | } 116 | } 117 | 118 | let mut business_keys: Vec = Vec::new(); 119 | // Build Business keys 120 | for dv_object in &dv_objects_v.1 { 121 | 122 | if dv_object.column_category == ColumnCategory::BusinessKeyPart { 123 | let column_data_id = Uuid::new_v4(); 124 | 125 | let column_data = ColumnData { 126 | id: column_data_id, 127 | system_id: dv_object.system_id, 128 | schema_name: dv_object.schema_name.clone(), 129 | table_oid: dv_object.table_oid, 130 | table_name: dv_object.table_name.clone(), 131 | column_name: dv_object.column_name.clone(), 132 | column_ordinal_position: dv_object.column_ordinal_position, 133 | column_type_name: dv_object.column_type_name.clone(), 134 | }; 135 | 136 | let 
mut business_key_part_links: Vec = Vec::new(); 137 | 138 | if dv_object.column_category == ColumnCategory::BusinessKeyPart { 139 | // Alias good for 1 BKP, Refactor for many BKPs 140 | let business_key_part_link = get_business_key_part_link(dv_object.business_key_name.clone(), column_data); 141 | business_key_part_links.push(business_key_part_link); 142 | } 143 | 144 | let business_key_name = dv_object.business_key_name.to_lowercase().clone(); 145 | 146 | let business_key_id = Uuid::new_v4(); 147 | 148 | let business_key = BusinessKey { 149 | id: business_key_id, 150 | name: business_key_name, 151 | business_key_part_links, 152 | descriptors: Vec::new(), // Descriptors place on Link Key not Business Key 153 | }; 154 | 155 | business_keys.push(business_key); 156 | } 157 | } 158 | 159 | let link_key_name = business_keys 160 | .iter() 161 | .map(|bk| bk.name.as_str()) 162 | .collect::>() 163 | .join("_"); 164 | 165 | let link_key_id = Uuid::new_v4(); 166 | 167 | let link_key = LinkKey { 168 | id: link_key_id, 169 | name: link_key_name, 170 | business_keys, 171 | descriptors, 172 | }; 173 | 174 | link_keys.push(link_key); 175 | } 176 | 177 | // Shadowing to remove mutability. 178 | let link_keys = link_keys; 179 | 180 | // Build a Vector of BusinessKeys 181 | let mut business_keys: Vec = Vec::new(); 182 | for dv_objects_v in dv_objects_hm_single_bkp { 183 | 184 | let mut descriptors: Vec = Vec::new(); 185 | let mut business_key_part_links: Vec = Vec::new(); 186 | 187 | // Build Descriptors 188 | for dv_object in &dv_objects_v.1 { 189 | 190 | let column_data_id = Uuid::new_v4(); 191 | 192 | let column_data = ColumnData { 193 | id: column_data_id, 194 | system_id: dv_object.system_id, 195 | schema_name: dv_object.schema_name.clone(), 196 | table_oid: dv_object.table_oid, 197 | table_name: dv_object.table_name.clone(), 198 | column_name: dv_object.column_name.clone(), 199 | column_ordinal_position: dv_object.column_ordinal_position, 200 | column_type_name: dv_object.column_type_name.clone(), 201 | }; 202 | let orbit = dv_object.table_name.clone(); 203 | 204 | if dv_object.column_category == ColumnCategory::Descriptor { 205 | let descriptor = get_descriptor(dv_object.column_name.clone(), column_data, orbit, false); 206 | descriptors.push(descriptor); 207 | } else if dv_object.column_category == ColumnCategory::DescriptorSensitive { 208 | let descriptor = get_descriptor(dv_object.column_name.clone(), column_data, orbit, true); 209 | descriptors.push(descriptor); 210 | } 211 | } 212 | 213 | // Build Business Key Part Links 214 | for dv_object in &dv_objects_v.1 { 215 | 216 | let column_data_id = Uuid::new_v4(); 217 | 218 | let column_data = ColumnData { 219 | id: column_data_id, 220 | system_id: dv_object.system_id, 221 | schema_name: dv_object.schema_name.clone(), 222 | table_oid: dv_object.table_oid, 223 | table_name: dv_object.table_name.clone(), 224 | column_name: dv_object.column_name.clone(), 225 | column_ordinal_position: dv_object.column_ordinal_position, 226 | column_type_name: dv_object.column_type_name.clone(), 227 | }; 228 | 229 | if dv_object.column_category == ColumnCategory::BusinessKeyPart { 230 | // Alias good for 1 BKP, Refactor for many BKPs 231 | let business_key_part_link = get_business_key_part_link(dv_object.business_key_name.clone(), column_data); 232 | business_key_part_links.push(business_key_part_link); 233 | } 234 | } 235 | 236 | // TODO: Handle multiple business keys for link tables. Ensure appropriate error handling! 
237 |     let business_key_name: String = {
238 |         let mut business_key_name = String::new();
239 |         for dv_object in &dv_objects_v.1 {
240 |             if dv_object.business_key_name.to_lowercase() != "na" {
241 |                 business_key_name = dv_object.business_key_name.to_lowercase();
242 |             }
243 |         }
244 |         business_key_name
245 |     };
246 |
247 |     let business_key_id = Uuid::new_v4();
248 |     let business_key = BusinessKey {
249 |         id: business_key_id,
250 |         name: business_key_name,
251 |         business_key_part_links,
252 |         descriptors
253 |     };
254 |
255 |     business_keys.push(business_key);
256 | }
257 |
258 | // Shadowing to remove mutability.
259 | let business_keys = business_keys;
260 |
261 | let dw_schema = guc::get_guc(guc::PgAutoDWGuc::DwSchema).expect("DW SCHEMA GUC is not set.");
262 |
263 | // Build DV DDL from business keys and link keys
264 |
265 | let mut dv_ddl_sql = String::new();
266 |
267 | for business_key in &business_keys {
268 |     let dv_business_key_ddl_sql = build_sql_from_business_key(&dw_schema, business_key);
269 |     dv_ddl_sql.push_str(&dv_business_key_ddl_sql);
270 | }
271 |
272 | for link_key in &link_keys {
273 |     let dv_link_key_ddl_sql = build_sql_from_link_key(&dw_schema, link_key);
274 |     dv_ddl_sql.push_str(&dv_link_key_ddl_sql);
275 | }
276 |
277 | log!("Running DV DDL: {}", dv_ddl_sql);
278 |
279 | // Build Tables using DDL
280 | Spi::connect( |mut client| {
281 |     _ = client.update(&dv_ddl_sql, None, None);
282 |     log!("DV Tables Built");
283 | }
284 | );
285 |
286 | // Build DVTransformerSchema
287 |
288 | // Get the current time in GMT
289 | let now_gmt = Utc::now().naive_utc();
290 |
291 | let mut dv_schema = DVSchema {
292 |     id: build_id,
293 |     dw_schema,
294 |     create_timestamp_gmt: now_gmt,
295 |     modified_timestamp_gmt: now_gmt,
296 |     business_keys,
297 |     link_keys,
298 | };
299 |
300 | // Add Target Columns to dv_schema links.
301 |
302 | dv_schema_add_target_columns(&mut dv_schema);
303 |
304 | log!("DV Schema JSON: {:#?}", dv_schema);
305 |
306 | dv_schema_push_to_repo(&build_id.to_string(), &mut dv_schema);
307 |
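Aside: the push-to-repo followed by an immediate reload (below) effectively round-trips `DVSchema` through its JSON form. A minimal sketch of the same check as a plain unit test, assuming only `serde_json` and the `DVSchema` definition in `model/dv_schema.rs` (the helper name is illustrative):

```rust
// Sketch: verifies DVSchema survives serialize/deserialize unchanged
// in the fields that matter for rebuilds.
fn assert_schema_round_trip(schema: &DVSchema) {
    let json = serde_json::to_string(schema).expect("DVSchema should serialize");
    let reloaded: DVSchema = serde_json::from_str(&json).expect("DVSchema should deserialize");
    assert_eq!(schema.id, reloaded.id);
    assert_eq!(schema.business_keys.len(), reloaded.business_keys.len());
    assert_eq!(schema.link_keys.len(), reloaded.link_keys.len());
}
```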
308 | // TODO: Remove; this reload is redundant and exists for testing purposes. However, this function will be integral for future data refreshes.
309 | match dv_load_schema_from_build_id(&build_id.to_string()) {
310 |     Some(schema) => {
311 |         dv_schema = schema;
312 |     }
313 |     None => {
314 |         panic!("Repo Error")
315 |     }
316 | };
317 |
318 | insert_dw_source_columns(&dv_schema);
319 |
320 | if load_data {dv_data_loader(&dv_schema);}
321 |
322 | }
323 |
324 | fn insert_dw_source_columns(dv_schema: &DVSchema) {
325 |
326 |     let insert_dw_source_column: &str = r#"
327 |         INSERT INTO auto_dw.dw_source_objects (table_oid, column_ordinal_position)
328 |         VALUES ($1, $2)
329 |     "#;
330 |
331 |     for column in dv_schema.get_columns() {
332 |         let table_oid = column.0;
333 |         let column_ordinal_position = column.1;
334 |
335 |         log!("DV'd Table: {}, Col: {column_ordinal_position}", table_oid.as_u32());
336 |
337 |         Spi::connect( |mut client| {
338 |             _ = client.update(insert_dw_source_column, None,
339 |                 Some(vec![
340 |                     (PgOid::from(pg_sys::OIDOID), table_oid.into_datum()),
341 |                     (PgOid::from(pg_sys::INT2OID), column_ordinal_position.into_datum()),
342 |                 ]));
343 |         }
344 |         );
345 |     }
346 | }
347 |
348 | fn dv_schema_push_to_repo(build_id: &String, dv_schema: &mut DVSchema) {
349 |
350 |     let now_gmt = Utc::now().naive_utc();
351 |
352 |     dv_schema.modified_timestamp_gmt = now_gmt;
353 |
354 |     let insert_schema_query: &str = r#"
355 |         INSERT INTO auto_dw.dv_repo (build_id, schema)
356 |         VALUES ($1, $2)
357 |     "#;
358 |
359 |     let repo_json_string = serde_json::to_string(dv_schema).unwrap();
360 |
361 |     // Insert the schema JSON into the repo table
362 |     Spi::connect( |mut client| {
363 |         _ = client.update(insert_schema_query, None,
364 |             Some(vec![
365 |                 (PgOid::from(pg_sys::TEXTOID), build_id.into_datum()),
366 |                 (PgOid::from(pg_sys::JSONOID), repo_json_string.into_datum()),
367 |             ]));
368 |     }
369 |     );
370 |
371 | }
372 |
373 | fn dv_schema_add_target_columns(dv_schema: &mut DVSchema) {
374 |
375 |     for link_key in &mut dv_schema.link_keys {
376 |         for descriptor in &mut link_key.descriptors {
377 |             descriptor_add_target_columns(&dv_schema.dw_schema, descriptor);
378 |         }
379 |
380 |         for business_key in &mut link_key.business_keys {
381 |             for business_key_part_link in &mut business_key.business_key_part_links {
382 |                 business_key_part_link_add_hub_target_column(&dv_schema.dw_schema, &business_key.name, business_key_part_link);
383 |             }
384 |         }
385 |     }
386 |
387 |     for business_key in &mut dv_schema.business_keys {
388 |
389 |         // For Descriptors in Business Keys
390 |         for descriptor in &mut business_key.descriptors {
391 |             descriptor_add_target_columns(&dv_schema.dw_schema, descriptor);
392 |         }
393 |
394 |         // For Business Key Parts in Business Keys
395 |         for business_key_part_link in &mut business_key.business_key_part_links {
396 |             business_key_part_link_add_hub_target_column(&dv_schema.dw_schema, &business_key.name, business_key_part_link);
397 |         }
398 |
399 |     }
400 | }
401 |
402 | fn business_key_part_link_add_hub_target_column(schema_name: &String, business_key_name: &String, business_key_part_link: &mut BusinessKeyPartLink ) {
403 |
404 |     let table_name = &{"hub_".to_string() + business_key_name};
405 |     let column_name = &(business_key_part_link.alias.clone() + "_bk");
406 |
407 |     let get_column_data = queries::get_column_data(schema_name, table_name, column_name);
408 |
409 |     let column_data: Option<ColumnData> = Spi::connect( |client| {
410 |
411 |         match client.select(&get_column_data, None, None) {
412 |             Ok(column_data) => {
413 |                 // Only 0 or 1 record should be returned.
414 |                 if let Some(column_data_record) = column_data.into_iter().next() {
415 |                     let system_id = column_data_record.get_datum_by_ordinal(1).unwrap().value::<i64>().unwrap().unwrap();
416 |                     let _schema_oid = column_data_record.get_datum_by_ordinal(2).unwrap().value::<Oid>().unwrap().unwrap();
417 |                     let schema_name = column_data_record.get_datum_by_ordinal(3).unwrap().value::<String>().unwrap().unwrap();
418 |                     let table_name = column_data_record.get_datum_by_ordinal(4).unwrap().value::<String>().unwrap().unwrap();
419 |                     let table_oid = column_data_record.get_datum_by_ordinal(5).unwrap().value::<Oid>().unwrap().unwrap();
420 |                     let column_name = column_data_record.get_datum_by_ordinal(6).unwrap().value::<String>().unwrap().unwrap();
421 |                     let column_ordinal_position = column_data_record.get_datum_by_ordinal(7).unwrap().value::<i16>().unwrap().unwrap();
422 |                     let column_type_name = column_data_record.get_datum_by_ordinal(8).unwrap().value::<String>().unwrap().unwrap();
423 |
424 |                     let column_id = Uuid::new_v4();
425 |
426 |                     let column_data = ColumnData {
427 |                         id: column_id,
428 |                         system_id,
429 |                         schema_name,
430 |                         table_oid,
431 |                         table_name,
432 |                         column_name,
433 |                         column_ordinal_position,
434 |                         column_type_name,
435 |                     };
436 |
437 |                     return Some(column_data)
438 |
439 |                 } else {
440 |                     log!("Column Data Not available.");
441 |
442 |                 }
443 |                 return None
444 |             }
445 |             Err(e) => {
446 |                 log!("Target Column Data Error: {:?}", e);
447 |                 return None
448 |             }
449 |         }
450 |     });
451 |
452 |     business_key_part_link.hub_target_column = column_data;
453 |
454 | }
455 |
456 | fn descriptor_add_target_columns(schema_name: &String, descriptor: &mut Descriptor ) {
457 |
458 |     let table_name = &{"sat_".to_string() + &descriptor.orbit + {if descriptor.is_sensitive { "_sensitive" } else {""}}};
459 |     let column_name = &descriptor.descriptor_link.alias;
460 |
461 |     let get_column_data = queries::get_column_data(schema_name, table_name, column_name);
462 |
463 |     let column_data: Option<ColumnData> = Spi::connect( |client| {
464 |
465 |         match client.select(&get_column_data, None, None) {
466 |             Ok(column_data) => {
467 |                 // Only 0 or 1 record should be returned.
468 |                 if let Some(column_data_record) = column_data.into_iter().next() {
469 |                     let system_id = column_data_record.get_datum_by_ordinal(1).unwrap().value::<i64>().unwrap().unwrap();
470 |                     let _schema_oid = column_data_record.get_datum_by_ordinal(2).unwrap().value::<Oid>().unwrap().unwrap();
471 |                     let schema_name = column_data_record.get_datum_by_ordinal(3).unwrap().value::<String>().unwrap().unwrap();
472 |                     let table_name = column_data_record.get_datum_by_ordinal(4).unwrap().value::<String>().unwrap().unwrap();
473 |                     let table_oid = column_data_record.get_datum_by_ordinal(5).unwrap().value::<Oid>().unwrap().unwrap();
474 |                     let column_name = column_data_record.get_datum_by_ordinal(6).unwrap().value::<String>().unwrap().unwrap();
475 |                     let column_ordinal_position = column_data_record.get_datum_by_ordinal(7).unwrap().value::<i16>().unwrap().unwrap();
476 |                     let column_type_name = column_data_record.get_datum_by_ordinal(8).unwrap().value::<String>().unwrap().unwrap();
477 |
478 |                     let column_id = Uuid::new_v4();
479 |
480 |                     let column_data = ColumnData {
481 |                         id: column_id,
482 |                         system_id,
483 |                         schema_name,
484 |                         table_oid,
485 |                         table_name,
486 |                         column_name,
487 |                         column_ordinal_position,
488 |                         column_type_name,
489 |                     };
490 |
491 |                     return Some(column_data)
492 |
493 |                 } else {
494 |                     log!("Column Data Not available.");
495 |                 }
496 |                 return None
497 |             }
498 |             Err(e) => {
499 |                 log!("Target Column Data Error: {:?}", e);
500 |                 return None
501 |             }
502 |         }
503 |     });
504 |
505 |     descriptor.descriptor_link.target_column = column_data;
506 | }
507 |
508 | fn get_descriptor(column_name: String, column_data: ColumnData, orbit: String, is_sensitive: bool) -> Descriptor {
509 |     let descriptor_link_id = Uuid::new_v4();
510 |     let descriptor_link = DescriptorLink {
511 |         id: descriptor_link_id,
512 |         alias: column_name, // TODO: Give the user an option to change name in the future - modality TBD.
513 |         source_column: Some(column_data),
514 |         target_column: None,
515 |     };
516 |     let descriptor_id = Uuid::new_v4();
517 |     let descriptor = Descriptor {
518 |         id: descriptor_id,
519 |         descriptor_link,
520 |         orbit,
521 |         is_sensitive,
522 |     };
523 |
524 |     descriptor
525 | }
526 |
527 | fn get_business_key_part_link(alias: String, column_data: ColumnData) -> BusinessKeyPartLink {
528 |     let business_key_part_link_id = Uuid::new_v4();
529 |     let mut sources_column_data: Vec<ColumnData> = Vec::new();
530 |     sources_column_data.push(column_data);
531 |
532 |     let business_key_link = BusinessKeyPartLink {
533 |         id: business_key_part_link_id,
534 |         alias,
535 |         source_columns: sources_column_data,
536 |         hub_target_column: None,
537 |     };
538 |
539 |     business_key_link
540 | }
541 |
542 | fn build_sql_from_link_key(dw_schema: &String, link_key: &LinkKey) -> String {
543 |
544 |     let mut dv_link_key_ddl_sql = String::new();
545 |
546 |     let link_key_name = &link_key.name;
547 |
548 |     let mut bk_name_types: Vec<String> = Vec::new();
549 |
550 |     for bk in &link_key.business_keys {
551 |         let bk_name_type = format!("hub_{}_hk VARCHAR", bk.name);
552 |         bk_name_types.push(bk_name_type);
553 |     }
554 |
555 |     let bk_name_types_subsql = bk_name_types.join(",\n");
556 |
557 |     dv_link_key_ddl_sql +=
558 |         &format!(r#"
559 |         CREATE TABLE IF NOT EXISTS {dw_schema}.link_{link_key_name} (
560 |             link_{link_key_name}_hk VARCHAR NOT NULL,
561 |             load_ts TIMESTAMP WITHOUT TIME ZONE NOT NULL,
562 |             record_source VARCHAR NOT NULL,
563 |             {bk_name_types_subsql}
564 |         );
565 |         "#);
566 |
567 |     // Sat Buildout
568 |     let mut satellite_sqls: HashMap<String, String> = HashMap::new();
569 |
570 |     for descriptor in &link_key.descriptors {
571 |
572 |         let sensitive_string = {
573 |             if descriptor.is_sensitive {
574 |                 "_sensitive".to_string()
575 |             } else {
576 |                 "".to_string()
577 |             }
578 |         };
579 |
580 |         let satellite_sql_key = descriptor.orbit.clone() + &sensitive_string;
581 |         let desc_column_name = &descriptor.descriptor_link.alias;
582 |         let desc_column_type = &descriptor.descriptor_link.source_column.as_ref().unwrap().column_type_name;
583 |         let sat_descriptor_sql_part: String = format!(",\n    {} {}", desc_column_name, desc_column_type);
584 |
585 |         if let Some(existing_sat_sql) = satellite_sqls.get_mut(&satellite_sql_key) {
586 |             if let Some(pos) = existing_sat_sql.find(");") {
587 |                 existing_sat_sql.insert_str(pos, &sat_descriptor_sql_part);
588 |             } else {
589 |                 log!("The substring \");\" was not found in the original string.");
590 |             }
591 |         } else {
592 |             let begin_sat_sql =
593 |                 format!(r#"
594 |                 CREATE TABLE {}.sat_{} (
595 |                     link_{}_hk VARCHAR NOT NULL,
596 |                     load_ts TIMESTAMP WITHOUT TIME ZONE NOT NULL,
597 |                     record_source VARCHAR NOT NULL,
598 |                     sat_{}_hd VARCHAR NOT NULL{});
599 |                 "#, dw_schema, satellite_sql_key, link_key.name, satellite_sql_key, sat_descriptor_sql_part);
600 |             satellite_sqls.insert(satellite_sql_key, begin_sat_sql);
601 |         }
602 |     }
603 |
604 |     for satellite_sql in satellite_sqls {
605 |         dv_link_key_ddl_sql.push_str(&satellite_sql.1);
606 |     }
607 |
608 |     for business_key in &link_key.business_keys {
609 |         dv_link_key_ddl_sql.push_str(&build_sql_from_business_key(dw_schema, business_key));
610 |     }
611 |
612 |     dv_link_key_ddl_sql
613 | }
614 |
615 | fn build_sql_from_business_key(dw_schema: &String, business_key: &BusinessKey) -> String {
616 |     let mut dv_business_key_ddl_sql = String::new();
617 |
618 |     // Hub Buildout
619 |     let mut hub_bks = String::new();
620 |
621 |     for part_link in &business_key.business_key_part_links {
622 |
let r = format!(r#", 623 | {}_bk VARCHAR"#, part_link.alias); 624 | hub_bks.push_str(&r); 625 | } 626 | 627 | let hub_sql = 628 | format!(r#" 629 | CREATE TABLE IF NOT EXISTS {}.hub_{} ( 630 | hub_{}_hk VARCHAR NOT NULL, 631 | load_ts TIMESTAMP WITHOUT TIME ZONE NOT NULL, 632 | record_source VARCHAR NOT NULL{} 633 | ); 634 | "#, dw_schema, business_key.name, business_key.name, hub_bks); 635 | 636 | dv_business_key_ddl_sql.push_str(&format!( 637 | r#" 638 | {}"#, hub_sql)); 639 | 640 | // Sat Buildout 641 | let mut satellite_sqls: HashMap = HashMap::new(); 642 | 643 | for descriptor in &business_key.descriptors { 644 | 645 | let sensitive_string = { 646 | if descriptor.is_sensitive == true { 647 | "_sensitive".to_string() 648 | } else { 649 | "".to_string() 650 | } 651 | }; 652 | 653 | let satellite_sql_key = descriptor.orbit.clone() + &sensitive_string; 654 | let desc_column_name = &descriptor.descriptor_link.alias; 655 | let desc_column_type = &descriptor.descriptor_link.source_column.as_ref().unwrap().column_type_name; 656 | let sat_descriptor_sql_part: String = format!(",\n {} {}", desc_column_name, desc_column_type); 657 | 658 | if let Some(existing_sat_sql) = satellite_sqls.get_mut(&satellite_sql_key) { 659 | if let Some(pos) = existing_sat_sql.find(");") { 660 | existing_sat_sql.insert_str(pos, &sat_descriptor_sql_part); 661 | } else { 662 | println!("The substring \");\" was not found in the original string."); 663 | } 664 | } else { 665 | let begin_sat_sql = 666 | format!(r#" 667 | CREATE TABLE {}.sat_{} ( 668 | hub_{}_hk VARCHAR NOT NULL, 669 | load_ts TIMESTAMP WITHOUT TIME ZONE NOT NULL, 670 | record_source VARCHAR NOT NULL, 671 | sat_{}_hd VARCHAR NOT NULL{}); 672 | "#, dw_schema, satellite_sql_key, business_key.name, satellite_sql_key, sat_descriptor_sql_part); 673 | satellite_sqls.insert(satellite_sql_key, begin_sat_sql); 674 | } 675 | } 676 | 677 | for satellite_sql in satellite_sqls { 678 | dv_business_key_ddl_sql.push_str(&satellite_sql.1); 679 | } 680 | 681 | dv_business_key_ddl_sql 682 | } 683 | 684 | #[derive(Debug, PartialEq)] 685 | enum ColumnCategory { 686 | BusinessKeyPart, 687 | Descriptor, 688 | DescriptorSensitive, 689 | } 690 | 691 | impl ColumnCategory { 692 | fn from_str(input: &str) -> ColumnCategory { 693 | match input { 694 | "Business Key Part" => ColumnCategory::BusinessKeyPart, 695 | "Descriptor" => ColumnCategory::Descriptor, 696 | "Descriptor - Sensitive" => ColumnCategory::DescriptorSensitive, 697 | _ => panic!("'{}' is not a valid ColumnCategory", input), 698 | } 699 | } 700 | } 701 | 702 | #[derive(Debug)] 703 | struct TransformerObject { 704 | #[allow(dead_code)] 705 | schema_name: String, 706 | table_name: String, 707 | business_key_name: String, 708 | column_name: String, 709 | column_type_name: String, 710 | system_id: i64, 711 | table_oid: Oid, 712 | column_ordinal_position: i16, 713 | column_category: ColumnCategory, 714 | } 715 | 716 | // Separates TransformerObject with multiple business key parts 717 | fn separate_by_business_parts(dv_objects_hm_single_bkp: HashMap>) -> (HashMap>, HashMap>) { 718 | 719 | let mut single_business_key_part: HashMap> = HashMap::new(); 720 | let mut multiple_business_key_parts: HashMap> = HashMap::new(); 721 | 722 | for (table_oid, transformer_objects) in dv_objects_hm_single_bkp { 723 | let business_key_count = transformer_objects.iter() 724 | .filter(|obj| matches!(obj.column_category, ColumnCategory::BusinessKeyPart)) 725 | .count(); 726 | 727 | if business_key_count > 1 { 728 | 
-------------------------------------------------------------------------------- /extension/src/controller/dv_loader.rs: --------------------------------------------------------------------------------
1 | use pgrx::prelude::*;
2 | use std::collections::HashMap;
3 | use crate::model::dv_schema::*;
4 |
5 |
6 | pub fn get_dv_schemas() -> Vec<DVSchema> {
7 |
8 |     // get DV_SCHEMAS via Query
9 |     let get_schemas_query: &str = r#"
10 |         SELECT schema
11 |         FROM auto_dw.dv_repo;
12 |     "#;
13 |
14 |     // Load schemas
15 |     let mut dv_schemas: Vec<DVSchema> = Vec::new();
16 |     Spi::connect( |client| {
17 |
18 |         let schema_results = client.select(get_schemas_query, None, None);
19 |         match schema_results {
20 |             Ok(schema_results) => {
21 |                 for schema_result in schema_results {
22 |                     let schema_json = schema_result.get_datum_by_ordinal(1).unwrap().value::<pgrx::Json>().unwrap().unwrap();
23 |                     let pgrx::Json(schema_json_value) = schema_json;
24 |                     let dv_schema: Result<DVSchema, serde_json::Error> = serde_json::from_value(schema_json_value);
25 |
26 |                     match dv_schema {
27 |                         Ok(dv_schema) => dv_schemas.push(dv_schema),
28 |                         Err(e) => panic!("Failure to unwrap dv_schema, error: {e}"),
29 |                     }
30 |                 }
31 |             },
32 |             Err(e) => panic!("Get Schemas Query Failure, error: {e}"),
33 |         }
34 |     });
35 |
36 |     dv_schemas
37 | }
38 |
39 | // Load All DV Schemas
40 |
41 | pub fn dv_load_schemas_all() -> bool {
42 |
43 |     for dv_schema in get_dv_schemas() {
44 |         dv_data_loader(&dv_schema);
45 |         log!("DV Schema (Build ID) Loaded: {}", dv_schema.id.to_string())
46 |     }
47 |     true
48 | }
49 |
50 | pub fn dv_load_schema_from_build_id(build_id: &String) -> Option<DVSchema> {
51 |     let get_schema_query: &str = r#"
52 |         SELECT schema
53 |         FROM auto_dw.dv_repo
54 |         WHERE build_id = $1
55 |     "#;
56 |
57 |     // Variable to store the result
58 |     let mut schema_result: Option<DVSchema> = None;
59 |
60 |     // Load Schema w/ Build ID
61 |     Spi::connect( |client| {
62 |         let results = client.select(get_schema_query, None,
63 |             Some(vec![
64 |                 (PgOid::from(pg_sys::TEXTOID), build_id.into_datum()),
65 |             ]));
66 |
67 |         match results {
68 |             Ok(results) => {
69 |                 if let Some(result) = results.into_iter().next() {
70 |                     let schema_json = result.get_datum_by_ordinal(1).unwrap().value::<pgrx::Json>().unwrap().unwrap();
71 |                     let deserialized_schema: Result<DVSchema, serde_json::Error> = serde_json::from_value(schema_json.0);
72 |                     match deserialized_schema {
73 |                         Ok(deserialized_schema) => {
74 |                             schema_result = Some(deserialized_schema);
75 |                         },
76 |                         Err(_) => {
77 |                             log!("Schema could not be deserialized");
78 |                         },
79 |                     }
80 |                 }
81 |             },
82 |             Err(_) => {
83 |                 log!("Get Schema Query Failure.");
84 |             },
85 |         }
86 |
87 |     });
88 |     return schema_result;
89 | }
90 |
91 | // Refreshes based on dv_schema
92 | pub fn dv_data_loader(dv_schema: &DVSchema) {
93 |
94 |     // Create DML: Link Load SQL for Link Objects in DV Schema
95 |     let link_dmls = create_dv_link_dml_for_lks(dv_schema);
96 |
97 |     // Create DML: Hub Load SQL for BusinessKey and LinkKey Objects in DV Schema
98 |     let hub_dmls = create_dv_hub_dml_for_bks(dv_schema) + &create_dv_hub_dml_for_lks(dv_schema);
99 |
100 |     // Create DML: Satellite Load SQL for BK Objects in DV Schema
101 |     let sat_dmls = create_dv_sat_dml_for_bks_descriptors(dv_schema) + &create_dv_sat_dml_for_lks_descriptors(dv_schema);
102 |
103 |     // Run SQL
104 |     let dv_dmls = link_dmls + &hub_dmls + &sat_dmls;
105 |
106 |     log!("DML: {dv_dmls}");
107 |
108 |     // Load tables using DML
109 |     Spi::connect( |mut client| {
110 |
111 |         _ = client.update(&dv_dmls, None, None);
112 |         log!("Data Pushed to DV tables.");
113 |     }
114 |     );
115 |
116 | }
117 |
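Because every DML builder below filters through an anti-join against the target table, `dv_data_loader` is idempotent: re-running it should insert nothing new. A hedged test sketch of that property (the `my_dw.hub_customer` table name is hypothetical and depends on the schema actually built):

```rust
// Sketch only: assumes a build has already run against sample data.
#[pg_test]
fn test_dv_data_loader_is_idempotent() {
    for dv_schema in get_dv_schemas() {
        dv_data_loader(&dv_schema);
    }
    let first = Spi::get_one::<i64>("SELECT COUNT(*) FROM my_dw.hub_customer").unwrap();
    for dv_schema in get_dv_schemas() {
        dv_data_loader(&dv_schema); // second pass should be a no-op
    }
    let second = Spi::get_one::<i64>("SELECT COUNT(*) FROM my_dw.hub_customer").unwrap();
    assert_eq!(first, second);
}
```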
118 | fn create_dv_link_dml_for_lks(dv_schema: &DVSchema) -> String {
119 |     let mut link_insert_dmls = String::new();
120 |     let dw_schema = &dv_schema.dw_schema;
121 |
122 |     for link_key in &dv_schema.link_keys {
123 |         let mut link_bk_source_parts_name: Vec<String> = Vec::new();
124 |
125 |         let mut hub_name_hks: Vec<String> = Vec::new();
126 |         let mut hub_hash_sqls: Vec<String> = Vec::new();
127 |         for business_key in &link_key.business_keys {
128 |             // Array Parts
129 |             let bk_parts: Vec<String> =
130 |                 business_key.business_key_part_links
131 |                 .iter()
132 |                 .map(|part_link| part_link.source_columns[0].column_name.clone())
133 |                 .collect();
134 |
135 |             let bk_source_parts_joined = bk_parts.join("::TEXT,") + "::TEXT";
136 |             let business_key_name = &business_key.name;
137 |
138 |             hub_hash_sqls.push(format!("auto_dw.hash(ARRAY_TO_STRING(ARRAY[{bk_source_parts_joined}], ',')) AS hub_{business_key_name}_hk"));
139 |             link_bk_source_parts_name.push(bk_source_parts_joined);
140 |
141 |             // For Insert
142 |             hub_name_hks.push(format!("hub_{business_key_name}_hk"));
143 |         }
144 |
145 |         let link_key_name = &link_key.name;
146 |         let link_hk_parts = &(link_bk_source_parts_name.join("::TEXT, ") + "::TEXT");
147 |         let link_hk_sql =
148 |             format!("auto_dw.hash(ARRAY_TO_STRING(ARRAY[{link_hk_parts}], ',')) AS link_{link_key_name}_hk,");
149 |         let timestamp_sql = format!("(CURRENT_TIMESTAMP AT TIME ZONE 'UTC')::TIMESTAMP(6) AS load_ts,");
150 |         let record_source =
151 |             link_key.business_keys[0].business_key_part_links[0].source_columns[0].system_id.to_string() + ":" +
152 |             &link_key.business_keys[0].business_key_part_links[0].source_columns[0].schema_name;
153 |         let record_source_sql = format!("'{record_source}' AS record_source,");
154 |         let hubs_hash_sql = &hub_hash_sqls.join(", \n");
155 |         let schema_name = &link_key.business_keys[0].business_key_part_links[0].source_columns[0].schema_name;
156 |         let source_table_name = &link_key.business_keys[0].business_key_part_links[0].source_columns[0].table_name;
157 |         let source_schema_table_sql = format!("{schema_name}.{source_table_name}");
158 |
159 |         let hub_name_hks_sql = hub_name_hks.join(",\n");
160 |
161 |
162 |         let insert_sql = format!(
163 |             r#"
164 |             INSERT INTO {dw_schema}.link_{link_key_name} (
165 |                 link_{link_key_name}_hk,
166 |                 load_ts,
167 |                 record_source,
168 |                 {hub_name_hks_sql}
169 |             )"#
170 |         );
171 |
172 |         let source_cte_sql =
173 |             format!(
174 |                 r#"
175 |                 WITH
176 |                 stg_data AS (
177 |                     SELECT
178 |                         {link_hk_sql}
179 |                         {timestamp_sql}
180 |                         {record_source_sql}
181 |                         {hubs_hash_sql}
182 |                     FROM {source_schema_table_sql}
183 |                 ),
184 |                 "#
185 |             );
186 |
187 |         let new_source_cte_sql =
188 |             format!(
189 |                 r#"
190 |                 new_stg_data AS (
191 |                     SELECT stg_data.* FROM stg_data
192 |                     LEFT JOIN {dw_schema}.link_{link_key_name} ON stg_data.link_{link_key_name}_hk = link_{link_key_name}.link_{link_key_name}_hk
193 |                     WHERE link_{link_key_name}.link_{link_key_name}_hk IS NULL
194 |                 )
195 |                 "#
196 |             );
197 |
198 |         let select_sql = format!(
199 |             r#"
200 |             SELECT
201 |                 link_{link_key_name}_hk,
202 |                 load_ts,
203 |                 record_source,
204 |                 {hub_name_hks_sql}
205 |             FROM new_stg_data;
206 |             "#
207 |         );
208 |
209 |         let sql_for_link_insert = insert_sql + &source_cte_sql + &new_source_cte_sql + &select_sql;
210 |         link_insert_dmls.push_str(&sql_for_link_insert);
211 |     }
212 |
213 |     link_insert_dmls
214 | }
215 |
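For orientation, here is roughly what `create_dv_link_dml_for_lks` emits for a hypothetical link over `hub_order` and `hub_product` sourced from `public.order_details` (illustrative rendering only; the destination schema `my_dw`, the `1:public` record source, and all names are assumptions, and actual output depends on the `DVSchema` contents):

```rust
// Hypothetical rendered output, shown as the kind of string the function builds.
const EXAMPLE_LINK_DML: &str = r#"
INSERT INTO my_dw.link_order_product (
    link_order_product_hk,
    load_ts,
    record_source,
    hub_order_hk,
    hub_product_hk
)
WITH
stg_data AS (
    SELECT
        auto_dw.hash(ARRAY_TO_STRING(ARRAY[order_id::TEXT, product_id::TEXT], ',')) AS link_order_product_hk,
        (CURRENT_TIMESTAMP AT TIME ZONE 'UTC')::TIMESTAMP(6) AS load_ts,
        '1:public' AS record_source,
        auto_dw.hash(ARRAY_TO_STRING(ARRAY[order_id::TEXT], ',')) AS hub_order_hk,
        auto_dw.hash(ARRAY_TO_STRING(ARRAY[product_id::TEXT], ',')) AS hub_product_hk
    FROM public.order_details
),
new_stg_data AS (
    SELECT stg_data.* FROM stg_data
    LEFT JOIN my_dw.link_order_product ON stg_data.link_order_product_hk = link_order_product.link_order_product_hk
    WHERE link_order_product.link_order_product_hk IS NULL
)
SELECT
    link_order_product_hk,
    load_ts,
    record_source,
    hub_order_hk,
    hub_product_hk
FROM new_stg_data;
"#;
```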
216 | fn create_dv_hub_dml_for_lks(dv_schema: &DVSchema) -> String {
217 |
218 |     let mut link_insert_dmls = String::new();
219 |
220 |     for link_key in &dv_schema.link_keys {
221 |
222 |         for business_key in &link_key.business_keys {
223 |             let hub_dml = business_key_to_hub_dml(business_key, &dv_schema.dw_schema);
224 |             link_insert_dmls.push_str(&hub_dml);
225 |         }
226 |
227 |     }
228 |
229 |     link_insert_dmls
230 | }
231 |
232 | fn create_dv_hub_dml_for_bks(dv_schema: &DVSchema) -> String {
233 |
234 |     let mut hub_insert_dmls = String::new();
235 |
236 |     for business_key in &dv_schema.business_keys {
237 |         let hub_dml = business_key_to_hub_dml(business_key, &dv_schema.dw_schema);
238 |         hub_insert_dmls.push_str(&hub_dml);
239 |     }
240 |
241 |     hub_insert_dmls
242 | }
243 |
244 | fn create_dv_sat_dml_for_lks_descriptors(dv_schema: &DVSchema) -> String {
245 |     let mut sat_link_insert_dmls = String::new();
246 |     let dw_schema = &dv_schema.dw_schema;
247 |
248 |     for link_key in &dv_schema.link_keys {
249 |         let mut link_bk_source_parts_name: Vec<String> = Vec::new();
250 |         let mut hub_hash_sqls: Vec<String> = Vec::new();
251 |         for business_key in &link_key.business_keys {
252 |             // Array Parts
253 |             let bk_parts: Vec<String> =
254 |                 business_key.business_key_part_links
255 |                 .iter()
256 |                 .map(|part_link| part_link.source_columns[0].column_name.clone())
257 |                 .collect();
258 |
259 |             let bk_source_parts_joined = bk_parts.join("::TEXT,") + "::TEXT";
260 |             let business_key_name = &business_key.name;
261 |
262 |             hub_hash_sqls.push(format!("auto_dw.hash(ARRAY_TO_STRING(ARRAY[{bk_source_parts_joined}], ',')) AS hub_{business_key_name}_hk"));
263 |             link_bk_source_parts_name.push(bk_source_parts_joined);
264 |         }
265 |
266 |         let link_key_name = &link_key.name;
267 |         let link_hk_parts = &(link_bk_source_parts_name.join("::TEXT, ") + "::TEXT");
268 |         let link_hk_sql =
269 |             format!("auto_dw.hash(ARRAY_TO_STRING(ARRAY[{link_hk_parts}], ',')) AS link_{link_key_name}_hk,");
270 |         let timestamp_sql = format!("(CURRENT_TIMESTAMP AT TIME ZONE 'UTC')::TIMESTAMP(6) AS load_ts,");
271 |         let record_source =
272 |             link_key.business_keys[0].business_key_part_links[0].source_columns[0].system_id.to_string() + ":" +
273 |             &link_key.business_keys[0].business_key_part_links[0].source_columns[0].schema_name;
274 |         let record_source_sql = format!("'{record_source}' AS record_source,");
275 |         let schema_name = &link_key.business_keys[0].business_key_part_links[0].source_columns[0].schema_name;
276 |         let source_table_name = &link_key.business_keys[0].business_key_part_links[0].source_columns[0].table_name;
277 |         let source_schema_table_sql = format!("{schema_name}.{source_table_name}");
278 |
279 |         // Closure to load Sensitive and Non-Sensitive Satellites
280 |         let mut insert_link_sat_type = |type_string: &str, descriptors: Vec<&Descriptor>| {
281 |
282 |             let descriptor_names: Vec<&str> =
283 |                 descriptors
284 |                 .iter().map(|descriptor| {
285 |                     descriptor
286 |                         .descriptor_link
287 |                         .source_column.as_ref()
288 |                         .expect("Expected source_column to be Some, but found None")
289 |                         .column_name
290 |                         .as_str()
291 |                 }).collect();
292 |
293 |             let descriptor_names_comma_separated = descriptor_names.join(", ");
294 |
295 |             let insert_sql =
296 |                 format!(
297 |                     r#"
298 |                     INSERT INTO {dw_schema}.sat_{source_table_name}{type_string} (
299 |                         link_{link_key_name}_hk,
300 |                         load_ts,
301 |                         record_source,
302 |                         sat_{source_table_name}{type_string}_hd,
303 |                         {descriptor_names_comma_separated}
304 |                     )"#
305 |                 );
306 |
307 |             let descriptor_names_text_comma_separated = descriptor_names.join("::TEXT, ") + "::TEXT";
308 |             let descriptors_hd_sql =
309 |                 format!("auto_dw.hash(ARRAY_TO_STRING(ARRAY[{descriptor_names_text_comma_separated}], ',')) AS sat_{source_table_name}{type_string}_hd,");
310 |
311 |             let source_cte_sql =
312 |                 format!(
313 |                     r#"
314 |                     WITH
315 |                     stg_data AS (
316 |                         SELECT
317 |                             {link_hk_sql}
318 |                             {timestamp_sql}
319 |                             {record_source_sql}
320 |                             {descriptors_hd_sql}
321 |                             {descriptor_names_comma_separated}
322 |                         FROM {source_schema_table_sql}
323 |                     ),
324 |                     "#
325 |                 );
326 |
327 |             let new_source_cte_sql =
328 |                 format!(
329 |                     r#"
330 |                     new_stg_data AS (
331 |                         SELECT stg_data.* FROM stg_data
332 |                         LEFT JOIN {dw_schema}.sat_{source_table_name}{type_string} ON stg_data.link_{link_key_name}_hk = sat_{source_table_name}{type_string}.link_{link_key_name}_hk
333 |                             AND stg_data.sat_{source_table_name}{type_string}_hd = sat_{source_table_name}{type_string}.sat_{source_table_name}{type_string}_hd
334 |                         WHERE sat_{source_table_name}{type_string}.link_{link_key_name}_hk IS NULL
335 |                     )
336 |                     "#
337 |                 );
338 |
339 |             let select_sql =
340 |                 format!(
341 |                     r#"
342 |                     SELECT
343 |                         link_{link_key_name}_hk,
344 |                         load_ts,
345 |                         record_source,
346 |                         sat_{source_table_name}{type_string}_hd,
347 |                         {descriptor_names_comma_separated}
348 |                     FROM new_stg_data;
349 |                     "#
350 |                 );
351 |
352 |             let sql_for_link_insert = insert_sql + &source_cte_sql + &new_source_cte_sql + &select_sql;
353 |             sat_link_insert_dmls.push_str(&sql_for_link_insert);
354 |         };
355 |
356 |         let (descriptors_sensitive, descriptors_not_sensitive): (Vec<&Descriptor>, Vec<&Descriptor>) =
357 |             link_key.descriptors
358 |             .iter()
359 |             .partition(|descriptor| descriptor.is_sensitive);
360 |
361 |         // INSERT for Sensitive Descriptors
362 |         let has_sensitive_descriptors = !descriptors_sensitive.is_empty();
363 |         if has_sensitive_descriptors {
364 |             insert_link_sat_type("_sensitive", descriptors_sensitive);
365 |         }
366 |
367 |         // INSERT for "Standard" (Non-Sensitive) Descriptors
368 |         let has_descriptors = !descriptors_not_sensitive.is_empty();
369 |         if has_descriptors {
370 |             insert_link_sat_type("", descriptors_not_sensitive);
371 |         }
372 |
373 |     }
374 |
375 |     sat_link_insert_dmls
376 | }
377 |
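Aside: both satellite builders follow the same naming convention, splitting each table's descriptors across a standard satellite and a `_sensitive` satellite. A minimal sketch of that mapping, using the `Descriptor` type from `model/dv_schema.rs` (the helper itself is illustrative and not part of the current code):

```rust
// Sketch: how a descriptor's orbit and sensitivity flag determine its satellite.
fn satellite_name_for(descriptor: &Descriptor) -> String {
    let suffix = if descriptor.is_sensitive { "_sensitive" } else { "" };
    format!("sat_{}{}", descriptor.orbit, suffix)
}
```

Keeping sensitive descriptors in their own satellite lets access controls be applied per table rather than per column.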
378 | fn create_dv_sat_dml_for_bks_descriptors(dv_schema: &DVSchema) -> String {
379 |
380 |     let mut sat_insert_dmls = String::new();
381 |     let dw_schema = dv_schema.dw_schema.clone();
382 |
383 |     for business_key in &dv_schema.business_keys {
384 |
385 |         // Array Parts
386 |         let mut hub_bk_parts_sql_stg_array = String::new();
387 |         for part_link in &business_key.business_key_part_links {
388 |             // TODO: Need to account for more than one source. However, the Vec data structure isn't ideal; refactor.
389 |             let e = format!(r#"stg.{}::TEXT,"#, part_link.source_columns[0].column_name);
390 |             hub_bk_parts_sql_stg_array.push_str(&e);
391 |         }
392 |         hub_bk_parts_sql_stg_array.pop(); // Removing the last ","
393 |
394 |         // Sat Buildout
395 |         let mut sat_insert_sql_header_parts: HashMap<String, String> = HashMap::new();
396 |         let mut descriptors_for_sats: HashMap<String, Vec<&Descriptor>> = HashMap::new();
397 |
398 |         for descriptor in &business_key.descriptors {
399 |
400 |             let sensitive_string = {
401 |                 if descriptor.is_sensitive {
402 |                     "_sensitive".to_string()
403 |                 } else {
404 |                     "".to_string()
405 |                 }
406 |             };
407 |
408 |             let satellite_sql_key = descriptor.orbit.clone() + &sensitive_string;
409 |
410 |             descriptors_for_sats
411 |                 .entry(satellite_sql_key.clone())
412 |                 .or_insert_with(Vec::new)
413 |                 .push(descriptor);
414 |
415 |             let desc_column_name = &descriptor.descriptor_link.alias;
416 |
417 |             // SAT INSERT Header
418 |             let sat_descriptor_sql_part: String = format!(",\n    {}", desc_column_name);
419 |             if let Some(existing_sat_sql) = sat_insert_sql_header_parts.get_mut(&satellite_sql_key) {
420 |                 if let Some(pos) = existing_sat_sql.find(")") {
421 |                     existing_sat_sql.insert_str(pos, &sat_descriptor_sql_part);
422 |                 } else {
423 |                     log!("The substring \")\" was not found in the original string.");
424 |                 }
425 |             } else {
426 |                 let begin_sat_sql =
427 |                     format!(r#"
428 |                     INSERT INTO {}.sat_{} (
429 |                         hub_{}_hk,
430 |                         load_ts,
431 |                         record_source,
432 |                         sat_{}_hd{})
433 |                     "#,
434 |                     dw_schema, &satellite_sql_key,
435 |                     business_key.name,
436 |                     &satellite_sql_key, sat_descriptor_sql_part);
437 |
438 |                 sat_insert_sql_header_parts.insert(satellite_sql_key.clone(), begin_sat_sql);
439 |             }
440 |         }
441 |
442 |         // Array SQL
443 |         let mut sats_source_sql_array: HashMap<String, String> = HashMap::new();
444 |         for (key, descriptors) in descriptors_for_sats.clone() {
445 |             let array_part_str = sats_source_sql_array.entry(key.clone()).or_insert_with(String::new);
446 |
447 |             for descriptor in descriptors {
448 |                 if let Some(column) = descriptor.descriptor_link.source_column.as_ref() {
449 |                     let array_part = if array_part_str.is_empty() {
450 |                         format!("stg.{}::TEXT", column.column_name)
451 |                     } else {
452 |                         format!(", stg.{}::TEXT", column.column_name)
453 |                     };
454 |                     array_part_str.push_str(&array_part);
455 |                 }
456 |             }
457 |         }
458 |
459 |         // Column SQL
460 |         let mut sats_source_sql_cols: HashMap<String, String> = HashMap::new();
461 |         for (key, descriptors) in descriptors_for_sats.clone() {
462 |             let col_part_str = sats_source_sql_cols.entry(key.clone()).or_insert_with(String::new);
463 |
464 |             for descriptor in descriptors {
465 |                 if let Some(column) = descriptor.descriptor_link.source_column.as_ref() {
466 |                     let col_part = format!(r#",
467 |                     {}"#,
468 |                     column.column_name);
469 |                     col_part_str.push_str(&col_part);
470 |                 }
471 |             }
472 |         }
473 |
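Aside: the `sat_{key}_hd` column these builders compute is a hash diff. A row is re-inserted only when the hash of its descriptor payload changes, which is what makes the loads below incremental. A minimal sketch of the same idea in Rust, using the identical Sha256-hex scheme as the `auto_dw.hash` SQL function defined in `lib.rs` (the helper names are illustrative):

```rust
use sha2::{Digest, Sha256};

// Sketch: hash of the comma-joined descriptor values, matching auto_dw.hash.
fn hash_diff(descriptor_values: &[&str]) -> String {
    hex::encode(Sha256::digest(descriptor_values.join(",").as_bytes()))
}

// A satellite row has changed when its recomputed hash diff differs.
fn row_changed(old_hd: &str, descriptor_values: &[&str]) -> bool {
    hash_diff(descriptor_values) != old_hd
}
```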
474 |         // Main Insert
475 |
476 |         for (key, insert_header) in sat_insert_sql_header_parts {
477 |
478 |             let sat_source_sql_array = sats_source_sql_array.get(&key).map(|v| v.as_str()).unwrap_or("NA");
479 |             let sat_source_sql_cols = sats_source_sql_cols.get(&key).map(|v| v.as_str()).unwrap_or("NA");
480 |
481 |             // TODO: Change data structure to support multiple source schemas.
482 |             let source_schema_name = descriptors_for_sats
483 |                 .get(&key)
484 |                 .and_then(|v| v.get(0)) // Safely get the first element
485 |                 .and_then(|descriptor| descriptor.descriptor_link.source_column.as_ref()) // Safely access source_column
486 |                 .map(|source_column| source_column.schema_name.clone()) // Safely get schema_name and clone it
487 |                 .unwrap_or_default(); // Provide a default value in case of None
488 |
489 |             let source_table_name = descriptors_for_sats
490 |                 .get(&key)
491 |                 .and_then(|v| v.get(0)) // Safely get the first element
492 |                 .and_then(|descriptor| descriptor.descriptor_link.source_column.as_ref()) // Safely access source_column
493 |                 .map(|source_column| source_column.table_name.clone()) // Safely get table_name and clone it
494 |                 .unwrap_or_default(); // Provide a default value in case of None
495 |
496 |             let business_key_name = &business_key.name;
497 |
498 |             let insert_sql = format!(r#"
499 |             -- SAT INSERT SQL
500 |             {insert_header}
501 |             WITH stg AS (
502 |                 SELECT
503 |                     *,
504 |                     auto_dw.hash(
505 |                         ARRAY_TO_STRING(ARRAY[{hub_bk_parts_sql_stg_array}], ',')
506 |                     ) AS hub_{business_key_name}_hk,
507 |                     auto_dw.hash(
508 |                         ARRAY_TO_STRING(ARRAY[{sat_source_sql_array}], ',')
509 |                     ) AS sat_{key}_hd
510 |                 FROM {source_schema_name}.{source_table_name} AS stg
511 |             ),
512 |             new_stg_data AS (
513 |                 SELECT stg.*
514 |                 FROM stg
515 |                 LEFT JOIN {dw_schema}.sat_{key} ON
516 |                     stg.hub_{business_key_name}_hk = sat_{key}.hub_{business_key_name}_hk AND
517 |                     stg.sat_{key}_hd = sat_{key}.sat_{key}_hd
518 |                 WHERE sat_{key}.hub_{business_key_name}_hk IS NULL
519 |             )
520 |             SELECT
521 |                 hub_{business_key_name}_hk,
522 |                 (CURRENT_TIMESTAMP AT TIME ZONE 'UTC')::TIMESTAMP WITHOUT TIME ZONE AS load_ts,
523 |                 '{source_schema_name}' AS record_source,
524 |                 sat_{key}_hd
525 |                 {sat_source_sql_cols}
526 |             FROM new_stg_data
527 |             ;
528 |             "#);
529 |
530 |             sat_insert_dmls.push_str(&insert_sql);
531 |         }
532 |     }
533 |
534 |     sat_insert_dmls
535 | }
536 |
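The next function, `business_key_to_hub_dml`, emits two statements per hub: a one-time seed of the data vault "ghost" records (keys hashed from `-1` and `-2`, conventionally used for unknown and not-applicable members) and the idempotent main load. A sketch of how those seeded keys relate to the `auto_dw.hash` function (the helper is illustrative; it relies on `hash` being defined at the crate root in `lib.rs`):

```rust
// Sketch: the two ghost hub keys the init insert seeds. In the generated SQL
// this is auto_dw.hash(ARRAY_TO_STRING(ARRAY[-1], ',')::TEXT), i.e. hash('-1').
fn ghost_hub_keys() -> (String, String) {
    (crate::hash("-1"), crate::hash("-2"))
}
```

Because the init SELECTs are guarded by `WHERE NOT initialized.is_initialized`, the ghost rows are inserted only when the hub is empty.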
537 | fn business_key_to_hub_dml(business_key: &BusinessKey, dw_schema_name: &String) -> String {
538 |     let mut hub_insert_dml = String::new();
539 |
540 |     // Hub Buildout
541 |     let business_key_name = &business_key.name;
542 |
543 |     // Business Key Part(s)
544 |     let mut hub_bk_parts_sql = String::new();
545 |     for part_link in &business_key.business_key_part_links {
546 |         let r = format!(r#",
547 |         {}_bk"#, part_link.alias);
548 |         hub_bk_parts_sql.push_str(&r);
549 |     }
550 |
551 |     // INSERT INTO Header
552 |     let hub_insert_into_header_part_sql = format!(r#"
553 |     INSERT INTO {}.hub_{} (
554 |         hub_{}_hk,
555 |         load_ts,
556 |         record_source
557 |         {}
558 |     )
559 |     "#,
560 |     dw_schema_name, business_key_name, business_key_name, hub_bk_parts_sql);
561 |
562 |
563 |     // Business Key Part(s) Init SQL
564 |     let mut hub_bk_neg_1_init_parts_sql = String::new();
565 |     let mut hub_bk_neg_2_init_parts_sql = String::new();
566 |     for part_link in &business_key.business_key_part_links {
567 |         let neg_1: String = format!(r#",
568 |         '-1'::TEXT AS {}_bk"#, part_link.alias);
569 |         hub_bk_neg_1_init_parts_sql.push_str(&neg_1);
570 |         let neg_2: String = format!(r#",
571 |         '-2'::TEXT AS {}_bk"#, part_link.alias);
572 |         hub_bk_neg_2_init_parts_sql.push_str(&neg_2);
573 |     }
574 |
575 |     let hub_insert_into_init_part_sql = format!(r#"
576 |     WITH initialized AS (
577 |         SELECT
578 |             CASE
579 |                 WHEN COUNT(*) > 0 THEN TRUE
580 |                 ELSE FALSE
581 |             END is_initialized
582 |         FROM {dw_schema_name}.hub_{business_key_name}
583 |     )
584 |     SELECT
585 |         auto_dw.hash(ARRAY_TO_STRING(ARRAY[-1], ',')::TEXT) AS hub_{business_key_name}_hk,
586 |         '0001-01-01'::TIMESTAMP WITHOUT TIME ZONE AS load_ts,
587 |         'SYSTEM'::TEXT AS record_source
588 |         {hub_bk_neg_1_init_parts_sql}
589 |     FROM initialized WHERE NOT initialized.is_initialized
590 |     UNION
591 |     SELECT
592 |         auto_dw.hash(ARRAY_TO_STRING(ARRAY[-2], ',')::TEXT) AS hub_{business_key_name}_hk,
593 |         '0001-01-01'::TIMESTAMP WITHOUT TIME ZONE AS load_ts,
594 |         'SYSTEM'::TEXT AS record_source
595 |         {hub_bk_neg_2_init_parts_sql}
596 |     FROM initialized WHERE NOT initialized.is_initialized
597 |     ;
598 |     "#);
599 |
600 |     let hub_insert_init = hub_insert_into_header_part_sql.clone() + &hub_insert_into_init_part_sql;
601 |     hub_insert_dml.push_str(&hub_insert_init);
602 |
603 |     // Insert Main
604 |
605 |     // Array Parts
606 |     let mut hub_bk_parts_sql_stg_array = String::new();
607 |     for part_link in &business_key.business_key_part_links {
608 |         // TODO: Need to account for more than one source. However, the Vec data structure isn't ideal; refactor.
609 |         let e = format!(r#"stg.{}::TEXT,"#, part_link.source_columns[0].column_name);
610 |         hub_bk_parts_sql_stg_array.push_str(&e);
611 |     }
612 |     hub_bk_parts_sql_stg_array.pop(); // Removing the last ","
613 |
614 |     // Source Schema
615 |     let mut source_schema = String::new();
616 |     let mut source_table = String::new();
617 |
618 |     // Business Key Part(s)
619 |     let mut hub_bk_parts_stg_names = String::new();
620 |     for part_link in &business_key.business_key_part_links {
621 |         let source_column_name = &part_link.source_columns[0].column_name;
622 |         let source_column_alias = &part_link.alias;
623 |         let e = format!(r#",stg.{source_column_name}::TEXT AS {source_column_alias}_bk"#);
624 |         hub_bk_parts_stg_names.push_str(&e);
625 |         source_schema = part_link.source_columns[0].schema_name.clone();
626 |         source_table = part_link.source_columns[0].table_name.clone();
627 |     }
628 |
629 |     let hub_insert_into_main_part_sql = format!(r#"
630 |     WITH
631 |     stg_data AS (
632 |         SELECT
633 |             auto_dw.hash(
634 |                 ARRAY_TO_STRING(ARRAY[{hub_bk_parts_sql_stg_array}], ',')
635 |             ) AS hub_{business_key_name}_hk,
636 |             (CURRENT_TIMESTAMP AT TIME ZONE 'UTC')::TIMESTAMP(6) AS load_ts,
637 |             '{source_schema}' AS record_source
638 |             {hub_bk_parts_stg_names}
639 |         FROM {source_schema}.{source_table} AS stg
640 |     ),
641 |     new_stg_data AS (
642 |         SELECT stg_data.* FROM stg_data
643 |         LEFT JOIN {dw_schema_name}.hub_{business_key_name} ON stg_data.hub_{business_key_name}_hk = hub_{business_key_name}.hub_{business_key_name}_hk
644 |         WHERE hub_{business_key_name}.hub_{business_key_name}_hk IS NULL
645 |     )
646 |     SELECT
647 |         hub_{business_key_name}_hk,
648 |         load_ts,
649 |         record_source{hub_bk_parts_sql}
650 |     FROM new_stg_data
651 |     ;
652 |     "#
653 |     );
654 |
655 |     let hub_insert_main = hub_insert_into_header_part_sql + &hub_insert_into_main_part_sql;
656 |     hub_insert_dml.push_str(&hub_insert_main);
657 |
658 |     hub_insert_dml
659 | }
660 |
-------------------------------------------------------------------------------- /extension/src/controller/mod.rs: --------------------------------------------------------------------------------
1 | pub mod bgw_init;
2 | pub mod bgw_source_objects;
3 | pub mod bgw_transformer_client;
4 | pub mod dv_builder;
5 | pub mod dv_loader;
-------------------------------------------------------------------------------- /extension/src/lib.rs: --------------------------------------------------------------------------------
1 | mod controller; // Coordinates application logic and model-service interactions.
2 | mod model; // Defines data structures and data-related methods. 3 | mod utility; // Initialization, Configuration Management, and External Services 4 | 5 | use controller::dv_loader; 6 | pub use pgrx::prelude::*; 7 | use utility::guc; 8 | use uuid::Uuid; 9 | 10 | use sha2::{Sha256, Digest}; 11 | use hex; 12 | 13 | pgrx::pg_module_magic!(); 14 | 15 | use model::queries; 16 | 17 | #[pg_extern(name="go")] 18 | fn go_default() -> String { 19 | let accepted_transformer_confidence_level: String = 20 | utility::guc::get_guc(guc::PgAutoDWGuc::AcceptedTransformerConfidenceLevel) 21 | .unwrap_or_else(|| { 22 | error!("GUC: Unable to obtain parameter \"pg_auto_dw.accepted_transformer_confidence_level.\""); 23 | }); 24 | let build_id = Uuid::new_v4(); 25 | let message = format!("Build ID: {} | Data warehouse tables are currently being built.", build_id); 26 | info!("{}", message); 27 | let build_flag = "Build"; 28 | let build_status = "RTD"; 29 | let status = "Ready to Deploy"; 30 | let query_insert = &queries::insert_into_build_call( 31 | &build_id.to_string(), &build_flag, &build_status, &status, &accepted_transformer_confidence_level); 32 | _ = Spi::run(query_insert); 33 | let query_build_pull = &queries::build_object_pull(&build_id.to_string()); 34 | let load_data = true; 35 | controller::dv_builder::build_dv(build_id, query_build_pull, load_data); 36 | message 37 | } 38 | 39 | #[pg_extern(name="build")] 40 | fn build_default() -> String { 41 | let accepted_transformer_confidence_level: String = 42 | utility::guc::get_guc(guc::PgAutoDWGuc::AcceptedTransformerConfidenceLevel) 43 | .unwrap_or_else(|| { 44 | error!("GUC: Unable to obtain parameter \"pg_auto_dw.accepted_transformer_confidence_level.\""); 45 | }); 46 | let build_id = Uuid::new_v4(); 47 | let message = format!("Build ID: {} | Data warehouse tables are currently being built.", build_id); 48 | info!("{}", message); 49 | let build_flag = "Build"; 50 | let build_status = "RTD"; 51 | let status = "Ready to Deploy"; 52 | let query_insert = &queries::insert_into_build_call( 53 | &build_id.to_string(), &build_flag, &build_status, &status, &accepted_transformer_confidence_level); 54 | _ = Spi::run(query_insert); 55 | let query_build_pull = &queries::build_object_pull(&build_id.to_string()); 56 | let load_data = false; 57 | controller::dv_builder::build_dv(build_id, query_build_pull, load_data); 58 | message 59 | } 60 | 61 | // Syncing All DV Schemas 62 | #[pg_extern(name="sync")] 63 | fn sync_default() -> String { 64 | let load_complete = dv_loader::dv_load_schemas_all(); 65 | if load_complete { 66 | "All DV schema objects updated.".to_string() 67 | } else { 68 | "Failed Load".to_string() 69 | } 70 | } 71 | 72 | #[pg_extern] 73 | fn source_include( schema_pattern_include: &str, 74 | table_pattern_include: default!(Option<&str>, "NULL"), 75 | column_pattern_include: default!(Option<&str>, "NULL")) -> &'static str { 76 | // Include Patterns 77 | let schema_pattern_include: &str = schema_pattern_include; 78 | let table_pattern_include: &str = table_pattern_include.unwrap_or(".*"); 79 | let column_pattern_include: &str = column_pattern_include.unwrap_or(".*"); 80 | // Exclude Patterns 81 | let schema_pattern_exclude: &str = "a^"; 82 | let table_pattern_exclude: &str = "a^"; 83 | let column_pattern_exclude: &str = "a^"; 84 | _ = Spi::run(queries::source_object_dw( schema_pattern_include, 85 | table_pattern_include, 86 | column_pattern_include, 87 | schema_pattern_exclude, 88 | table_pattern_exclude, 89 | column_pattern_exclude) 90 | .as_str()); 
91 | "Pattern Included" 92 | } 93 | 94 | #[pg_extern] 95 | fn source_exclude( schema_pattern_exclude: &str, 96 | table_pattern_exclude: default!(Option<&str>, "NULL"), 97 | column_pattern_exclude: default!(Option<&str>, "NULL")) -> &'static str { 98 | let schema_pattern_include: &str = "a^"; 99 | let table_pattern_include: &str = "a^"; 100 | let column_pattern_include: &str = "a^"; 101 | let schema_pattern_exclude: &str = schema_pattern_exclude; 102 | let table_pattern_exclude: &str = table_pattern_exclude.unwrap_or(".*"); 103 | let column_pattern_exclude: &str = column_pattern_exclude.unwrap_or(".*"); 104 | _ = Spi::run(queries::source_object_dw( schema_pattern_include, 105 | table_pattern_include, 106 | column_pattern_include, 107 | schema_pattern_exclude, 108 | table_pattern_exclude, 109 | column_pattern_exclude) 110 | .as_str()); 111 | "Pattern Excluded" 112 | } 113 | 114 | #[pg_extern] 115 | fn source_column() -> Result< 116 | TableIterator< 117 | 'static, 118 | ( 119 | name!(schema, Option), 120 | name!(table, Option), 121 | name!(column, Option), 122 | name!(status, Option), 123 | name!(category, Option), 124 | name!(is_sensitive, Option), 125 | name!(confidence_level, Option), 126 | name!(status_response, Option), 127 | ) 128 | >, 129 | spi::Error, 130 | > { 131 | let accepted_transformer_confidence_level: String = 132 | utility::guc::get_guc(guc::PgAutoDWGuc::AcceptedTransformerConfidenceLevel) 133 | .unwrap_or_else(|| { 134 | error!("GUC: Unable to obtain parameter \"pg_auto_dw.accepted_transformer_confidence_level.\""); 135 | }); 136 | 137 | let query: &str = &queries::source_column(&accepted_transformer_confidence_level); 138 | 139 | info!("Evaluation of TABLE customer"); 140 | Spi::connect(|client| { 141 | Ok(client 142 | .select(query, None, None)? 143 | .map(|row| { 144 | ( 145 | row["schema"].value().ok().flatten(), 146 | row["table"].value().ok().flatten(), 147 | row["column"].value().ok().flatten(), 148 | row["status"].value().ok().flatten(), 149 | row["category"].value().ok().flatten(), 150 | row["is_sensitive"].value().ok().flatten(), 151 | row["confidence_level"].value().ok().flatten(), 152 | row["status_response"].value().ok().flatten(), 153 | ) 154 | }) 155 | .collect::>()) 156 | }) 157 | .map(TableIterator::new) 158 | } 159 | 160 | #[pg_extern(immutable, parallel_safe)] 161 | fn hash(input: &str) -> String { 162 | let digest = Sha256::digest(input.as_bytes()); 163 | hex::encode(digest) 164 | } 165 | 166 | #[cfg(any(test, feature = "pg_test"))] 167 | #[pg_schema] 168 | mod tests { 169 | use pgrx::prelude::*; 170 | 171 | // TODO: Unit Testing 172 | #[pg_test] 173 | fn test_go_default() { 174 | } 175 | 176 | } 177 | 178 | /// This module is required by `cargo pgrx test` invocations. 179 | /// It must be visible at the root of your extension crate. 
178 | /// This module is required by `cargo pgrx test` invocations.
179 | /// It must be visible at the root of your extension crate.
180 | #[cfg(test)]
181 | pub mod pg_test {
182 |     pub fn setup(_options: Vec<&str>) {
183 |         // perform one-off initialization when the pg_test framework starts
184 |     }
185 |
186 |     pub fn postgresql_conf_options() -> Vec<&'static str> {
187 |         // return any postgresql.conf settings that are required for your tests
188 |         vec![]
189 |     }
190 | }
191 |
-------------------------------------------------------------------------------- /extension/src/model/dv_schema.rs: --------------------------------------------------------------------------------
1 | use chrono::NaiveDateTime;
2 | use pgrx::pg_sys::Oid;
3 | use serde::{Deserialize, Serialize};
4 | use uuid::Uuid;
5 |
6 | #[derive(Serialize, Deserialize, Debug)]
7 | pub struct DVSchema {
8 |     #[serde(rename = "ID")]
9 |     pub id: Uuid,
10 |     #[serde(rename = "DW Schema")]
11 |     pub dw_schema: String,
12 |     #[serde(rename = "Create Date")]
13 |     pub create_timestamp_gmt: NaiveDateTime,
14 |     #[serde(rename = "Modified Date")]
15 |     pub modified_timestamp_gmt: NaiveDateTime,
16 |     #[serde(rename = "Business Keys")]
17 |     pub business_keys: Vec<BusinessKey>,
18 |     #[serde(rename = "Link Keys")]
19 |     pub link_keys: Vec<LinkKey>,
20 | }
21 |
22 | impl DVSchema {
23 |     pub fn get_columns(&self) -> Vec<(Oid, i16)> {
24 |         let mut columns: Vec<(Oid, i16)> = Vec::new();
25 |         for link_key in &self.link_keys {
26 |             for business_key in &link_key.business_keys {
27 |                 columns.append(&mut business_key.get_columns());
28 |             }
29 |
30 |             for descriptor in &link_key.descriptors {
31 |                 if let Some(source_column) = &descriptor.descriptor_link.source_column {
32 |                     columns.push(source_column.get_column());
33 |                 }
34 |             }
35 |         }
36 |
37 |         for business_key in &self.business_keys {
38 |             columns.append(&mut business_key.get_columns());
39 |         }
40 |         columns
41 |     }
42 | }
43 |
44 | #[derive(Serialize, Deserialize, Debug)]
45 | pub struct LinkKey {
46 |     #[serde(rename = "ID")]
47 |     pub id: Uuid,
48 |     #[serde(rename = "Name")]
49 |     pub name: String,
50 |     #[serde(rename = "Business Keys")]
51 |     pub business_keys: Vec<BusinessKey>,
52 |     #[serde(rename = "Descriptors")]
53 |     pub descriptors: Vec<Descriptor>, // Commonly multiple descriptor values, but may also contain none
54 | }
55 |
56 | #[derive(Serialize, Deserialize, Debug)]
57 | pub struct BusinessKey {
58 |     #[serde(rename = "ID")]
59 |     pub id: Uuid,
60 |     #[serde(rename = "Name")]
61 |     pub name: String,
62 |     #[serde(rename = "Business Key Part Links")]
63 |     pub business_key_part_links: Vec<BusinessKeyPartLink>,
64 |     #[serde(rename = "Descriptors")]
65 |     pub descriptors: Vec<Descriptor>, // Commonly multiple descriptor values, but may also contain none
66 | }
67 |
68 | #[derive(Serialize, Deserialize, Debug)]
69 | pub struct BusinessKeyPartLink {
70 |     #[serde(rename = "ID")]
71 |     pub id: Uuid,
72 |     #[serde(rename = "Alias")]
73 |     pub alias: String,
74 |     #[serde(rename = "Source Column Data")]
75 |     pub source_columns: Vec<ColumnData>,
76 |     #[serde(rename = "Hub Target Column Data")]
77 |     pub hub_target_column: Option<ColumnData>,
78 | }
79 |
80 | impl BusinessKey {
81 |     pub fn get_columns(&self) -> Vec<(Oid, i16)> {
82 |         let mut columns: Vec<(Oid, i16)> = Vec::new();
83 |         // BK Part Search
84 |         for bkp_link in &self.business_key_part_links {
85 |             for source_column in &bkp_link.source_columns {
86 |                 columns.push(source_column.get_column());
87 |             }
88 |         }
89 |         // Descriptor Search
90 |         for descriptor in &self.descriptors {
91 |             if let Some(source_column) = &descriptor.descriptor_link.source_column {
92 |                 columns.push(source_column.get_column());
93 |             }
94 |         }
95 |         columns
96 |     }
97 | }
98 |
99 | #[derive(Serialize, Deserialize, Debug)]
100 | pub
struct Descriptor {
101 |     #[serde(rename = "ID")]
102 |     pub id: Uuid,
103 |     #[serde(rename = "Descriptor Link")]
104 |     pub descriptor_link: DescriptorLink,
105 |     #[serde(rename = "Orbit")]
106 |     pub orbit: String,
107 |     #[serde(rename = "Is Sensitive")]
108 |     pub is_sensitive: bool,
109 | }
110 |
111 | #[derive(Serialize, Deserialize, Debug)]
112 | pub struct DescriptorLink {
113 |     #[serde(rename = "ID")]
114 |     pub id: Uuid,
115 |     #[serde(rename = "Alias")]
116 |     pub alias: String,
117 |     #[serde(rename = "Source Column Data")]
118 |     pub source_column: Option<ColumnData>,
119 |     #[serde(rename = "Target Column Data")]
120 |     pub target_column: Option<ColumnData>,
121 | }
122 |
123 | #[derive(Serialize, Deserialize, Debug)]
124 | pub struct ColumnData {
125 |     #[serde(rename = "ID")]
126 |     pub id: Uuid,
127 |     #[serde(rename = "System ID")]
128 |     pub system_id: i64,
129 |     #[serde(rename = "Schema Name")]
130 |     pub schema_name: String,
131 |     #[serde(rename = "Table OID")]
132 |     pub table_oid: Oid,
133 |     #[serde(rename = "Table Name")]
134 |     pub table_name: String,
135 |     #[serde(rename = "Column Name")]
136 |     pub column_name: String,
137 |     #[serde(rename = "Column Ordinal Position")]
138 |     pub column_ordinal_position: i16,
139 |     #[serde(rename = "Column Type")]
140 |     pub column_type_name: String,
141 | }
142 |
143 | impl ColumnData {
144 |     pub fn get_column(&self) -> (Oid, i16) {
145 |         (self.table_oid, self.column_ordinal_position)
146 |     }
147 | }
148 |
-------------------------------------------------------------------------------- /extension/src/model/mod.rs: --------------------------------------------------------------------------------
1 | pub mod source_objects;
2 | pub mod dv_schema;
3 | pub mod queries;
4 | pub mod prompt_template;
-------------------------------------------------------------------------------- /extension/src/model/prompt_template.rs: --------------------------------------------------------------------------------
1 | #[derive(Debug)]
2 | pub enum PromptTemplate {
3 |     BKComponentIdentification,
4 |     BKName,
5 |     DescriptorSensitive,
6 | }
7 |
8 | impl PromptTemplate {
9 |     pub fn template(&self) -> &str {
10 |         match self {
11 |             PromptTemplate::BKComponentIdentification => r#"
12 | Task Title: Business Key Component Identification by Column in JSON Source Table Object
13 |
14 | You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your task is to evaluate if a specified column is a business key component and, if so, how likely it is. The results of your evaluations will be used to create downstream data vault structures.
15 |
16 | A business key component is an attribute that forms part of a business key, which may be either a component of a composite key or a single key that uniquely identifies the record set. Additionally, there may be multiple business keys within one table.
17 |
18 | Requested Task:
19 |
20 | Determine whether the specified column, identified by its column number (“column no”), is likely to represent a business key or a component of a business key.
21 |
22 | Request Details:
23 |
24 | If the column is a primary key, as indicated in the comments or column details, assume it is a business key component. However, this does not exclude the possibility of other business key components within the table, but it may reduce the likelihood of the specified column being the only business key.
25 | 26 | If the specified column could be categorized as an email or username, only consider it a business key component if there are no other attributes in the table that could reasonably serve as a business key component. 27 | 28 | Use the column comments, when available, as the primary source of definition, providing direct context from business users. These comments should take priority over the column’s name or data type in determining its purpose and usage. 29 | 30 | Confidence Value: 31 | 32 | Provide a confidence score between 0 and 1, rounded to two decimal places, representing your confidence in the likelihood that the column is a business key component. A value of 0.80 or higher is considered reasonably confident. 33 | 34 | Reason: 35 | 36 | Indicate why you made the decision you did. 37 | 38 | Output: 39 | 40 | Ensure the output conforms to the format shown in the examples below. 41 | 42 | Example Input 1) 43 | JSON Source Table Object: 44 | { 45 | "Schema Name": "public", 46 | "Table Name": "customer", 47 | "Column Details": [ 48 | "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", 49 | "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", 50 | "Column No: 3 Named: state of type: character(2) Column Comments: NA", 51 | "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" 52 | ] 53 | } 54 | 55 | Column No: 1 56 | 57 | Example Output 1) 58 | { 59 | "Business Key Component Identification": { 60 | "Is Business Key Component": true, 61 | "Confidence Value": 0.95, 62 | "Reason": "The 'customer_id' column is designated as the primary key, which is typically the best candidate for a business key component in the 'customer' table." 63 | } 64 | } 65 | 66 | Example Input 2) 67 | JSON Source Table Object: 68 | { 69 | "Schema Name": "sales", 70 | "Table Name": "order_details", 71 | "Column Details": [ 72 | "Column No: 1 Named: id of type: integer Column Comments: NA", 73 | "Column No: 2 Named: product_id of type: integer Column Comments: NA", 74 | "Column No: 3 Named: quantity of type: integer Column Comments: NA", 75 | "Column No: 4 Named: order_date of type: date Column Comments: NA" 76 | ] 77 | } 78 | 79 | Column No: 1 80 | 81 | Example Output 2) 82 | { 83 | "Business Key Component Identification": { 84 | "Is Business Key Component": true, 85 | "Confidence Value": 0.75, 86 | "Reason": "Although 'id' is not explicitly marked as a primary key, it is likely to uniquely identify each order detail, making it a strong candidate for a business key component." 87 | } 88 | } 89 | 90 | Example Input 3) 91 | JSON Source Table Object: 92 | { 93 | "Schema Name": "sales", 94 | "Table Name": "order_details", 95 | "Column Details": [ 96 | "Column No: 1 Named: order_id of type: integer Column Comments: NA", 97 | "Column No: 2 Named: product_id of type: integer Column Comments: NA", 98 | "Column No: 3 Named: quantity of type: integer Column Comments: NA", 99 | "Column No: 4 Named: order_date of type: date Column Comments: NA" 100 | ] 101 | } 102 | 103 | Column No: 1 104 | 105 | Example Output 3) 106 | { 107 | "Business Key Component Identification": { 108 | "Is Business Key Component": true, 109 | "Confidence Value": 0.85, 110 | "Reason": "The 'order_id' column likely represents the primary identifier for each order within the 'order_details' table. 
Although it is not explicitly marked as a primary key, 'order_id' is a common identifier for business entities, making it a strong candidate for a business key component." 111 | } 112 | } 113 | 114 | Example Input 4) 115 | JSON Source Table Object: 116 | { 117 | "Schema Name": "sales", 118 | "Table Name": "order_details", 119 | "Column Details": [ 120 | "Column No: 1 Named: order_id of type: integer Column Comments: NA", 121 | "Column No: 2 Named: product_id of type: integer Column Comments: NA", 122 | "Column No: 3 Named: quantity of type: integer Column Comments: NA", 123 | "Column No: 4 Named: order_date of type: date Column Comments: NA" 124 | ] 125 | } 126 | 127 | Column No: 2 128 | 129 | 130 | Example Output 4) 131 | { 132 | "Business Key Component Identification": { 133 | "Is Business Key Component": true, 134 | "Confidence Value": 0.80, 135 | "Reason": "'product_id' likely represents a key component that helps identify specific products associated with the order. It is not the sole key but may serve as part of a composite business key alongside 'order_id'." 136 | } 137 | } 138 | 139 | 140 | Example Input 5) 141 | JSON Source Table Object: 142 | { 143 | "Schema Name": "sales", 144 | "Table Name": "order_details", 145 | "Column Details": [ 146 | "Column No: 1 Named: order_id of type: integer Column Comments: NA", 147 | "Column No: 2 Named: product_id of type: integer Column Comments: NA", 148 | "Column No: 3 Named: quantity of type: integer Column Comments: NA", 149 | "Column No: 4 Named: order_date of type: date Column Comments: NA" 150 | ] 151 | } 152 | 153 | Column No: 3 154 | 155 | Example Output 5) 156 | { 157 | "Business Key Component Identification": { 158 | "Is Business Key Component": false, 159 | "Confidence Value": 0.30, 160 | "Reason": "The 'quantity' column represents a numeric value related to the number of products in the order, but it does not uniquely identify the record. It is unlikely to serve as a business key component." 161 | } 162 | } 163 | 164 | 165 | Example Input 6) 166 | JSON Source Table Object: 167 | { 168 | "Schema Name": "sales", 169 | "Table Name": "order_details", 170 | "Column Details": [ 171 | "Column No: 1 Named: order_id of type: integer Column Comments: NA", 172 | "Column No: 2 Named: product_id of type: integer Column Comments: NA", 173 | "Column No: 3 Named: quantity of type: integer Column Comments: NA", 174 | "Column No: 4 Named: order_date of type: date Column Comments: NA" 175 | ] 176 | } 177 | 178 | Column No: 4 179 | 180 | Example Output 6) 181 | { 182 | "Business Key Component Identification": { 183 | "Is Business Key Component": false, 184 | "Confidence Value": 0.40, 185 | "Reason": "'order_date' represents the date on which the order was placed. While it provides important context, it is not unique to the record and is therefore unlikely to serve as a business key component." 186 | } 187 | } 188 | 189 | Now, based on the instructions and examples above, please generate the JSON output for the following input. {hints} 190 | 191 | JSON Source Table Object: {new_json} 192 | 193 | Column No: {column_no} 194 | "#, 195 | PromptTemplate::BKName => r#" 196 | Task Title: Business Key Naming in JSON Source Table Object with specified Column 197 | 198 | You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your responses to requested tasks will be used to help create downstream data vault tables. 199 | 200 | Requested Task: Identify the business key name. 
The business key part column has already been identified, and its associated column number, “column no”, will be provided along with the JSON Source Table Object. Return a name that best represents the business key from a data vault perspective. 201 | 202 | Request Details: 203 | 204 | The Business Key Name should be crafted based on the attribute linked to the business key, as identified by the provided column number. Prioritize the attribute name over the table name if the attribute name is descriptive enough. It should clearly represent the core business entity, avoiding generic terms like “ID,” “number,” or “Entity.” The name should focus solely on the business aspect, using terms like “customer,” “employee,” or “seller” that directly reflect the entity’s purpose, without unnecessary suffixes or identifiers. If the attribute associated with the business key or its column comments are not descriptive enough, the table name or schema name can be used to help formulate the Business Key Name. 205 | 206 | Use the column comments, when available, as the primary source of definition, providing direct context from business users. These comments should take priority over the column’s name or data type in determining its business key name. 207 | 208 | Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in your chosen Business Key Name. A value of 0.80 or higher is considered reasonably confident. 209 | 210 | Reason: Indicate why you made the decision you did. 211 | 212 | Output: Ensure the output conforms to the format shown in the examples below. 213 | 214 | Example Input 1) 215 | JSON Source Table Object: 216 | { 217 | "Schema Name": "public", 218 | "Table Name": "customer", 219 | "Column Details": [ 220 | "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", 221 | "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", 222 | "Column No: 3 Named: state of type: character(2) Column Comments: NA", 223 | "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" 224 | ] 225 | } 226 | 227 | Column No: 1 228 | 229 | Example Output 1) 230 | { 231 | "Business Key Name": { 232 | "Name": "Customer", 233 | "Confidence Value": 0.9, 234 | "Reason": "The column 'customer_id' is a primary key and represents the unique identifier for customers in the 'customer' table. Given that the table name 'customer' directly reflects the business entity, 'Customer' is chosen as the Business Key Name. The confidence value is high because the identifier is straightforward and strongly aligned with the core business entity." 235 | } 236 | } 237 | 238 | Example Input 2) 239 | JSON Source Table Object: 240 | { 241 | "Schema Name": "sales", 242 | "Table Name": "order_details", 243 | "Column Details": [ 244 | "Column No: 1 Named: id of type: integer Column Comments: NA", 245 | "Column No: 2 Named: product_id of type: integer Column Comments: NA", 246 | "Column No: 3 Named: quantity of type: integer Column Comments: NA", 247 | "Column No: 4 Named: order_date of type: date Column Comments: NA" 248 | ] 249 | } 250 | 251 | Column No: 1 252 | 253 | Example Output 2) 254 | { 255 | "Business Key Name": { 256 | "Name": "Order", 257 | "Confidence Value": 0.85, 258 | "Reason": "The column 'id' is a primary key and serves as the unique identifier for records in the 'order_details' table. 
Although the column name 'id' is generic, the table name 'order_details' indicates that the records pertain to individual orders. Therefore, 'Order' is chosen as the Business Key Name to best represent the core business entity. The confidence value is slightly lower due to the generic nature of the column name, but it is still reasonably confident given the context provided by the table name." 259 | } 260 | } 261 | 262 | Now, based on the instructions and examples above, please generate the JSON output for the following input. {hints} 263 | 264 | JSON Source Table Object: {new_json} 265 | 266 | Column No: {column_no} 267 | "#, 268 | PromptTemplate::DescriptorSensitive => r#" 269 | Task Title: Identification of PII in JSON Source Table Object 270 | 271 | You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your task is to assist in the creation of downstream data vault tables by performing the requested tasks based on this information. 272 | 273 | Requested Task: Identify whether the descriptor is a “Descriptor - Sensitive” PII subtype. A descriptor column, along with its associated column number (“column no”), will be provided in the JSON Source Table Object. If you determine that the column contains Personally Identifiable Information (PII), categorize it as “Descriptor - Sensitive.” 274 | 275 | Request Details: 276 | PII Identification: Only consider a column as PII if it directly matches an item from the PII list provided below. Do not infer or project beyond this list. If a column name or its associated comment closely resembles an item from the list, classify it as PII. 277 | No Overgeneralization: Avoid overgeneralization or inference beyond what is explicitly stated in the list. Focus strictly on the provided PII list. 278 | 279 | Use the column comments, when available, as the primary source of definition, providing direct context from business users. These comments should take priority over the column’s name or data type in determining its identity. 280 | 281 | Personally Identifiable Information (PII) List: 282 | 283 | Consider any of the following types of information as PII and categorize the corresponding column as “Descriptor - Sensitive”: 284 | 285 | - Person’s Name: PII (Includes first name, last name, or both). 286 | - Social Security Number (SSN): PII 287 | - Driver’s License Number: PII 288 | - Passport Number: PII 289 | - Email Address: PII 290 | - Physical Street Address: PII (Includes street address, but excludes City, State, or standard 5-digit Zip code). 291 | - Extended Zip Code: PII (Any Zip code with more than 5 digits). 292 | - Telephone Number: PII (Includes both landline and mobile numbers). 293 | - Date of Birth: PII 294 | - Place of Birth: PII 295 | - Biometric Data: PII (Includes fingerprints, facial recognition data, iris scans). 296 | - Medical Information: PII (Includes health records, prescriptions). 297 | - Financial Information: PII (Includes bank account numbers, credit card numbers, debit card numbers). 298 | - Employment Information: PII (Includes employment records, salary information). 299 | - Insurance Information: PII (Includes policy numbers, claim information). 300 | - Education Records: PII (Includes student records, transcripts). 301 | - Online Identifiers: PII (Includes usernames, IP addresses, cookies, MAC addresses). 302 | - Photographs or Videos: PII (Any media that can identify an individual). 
303 | - National Identification Numbers: PII (Includes identifiers outside of SSN, such as National Insurance Numbers in the UK). 304 | - Geolocation Data: PII (Includes GPS coordinates, location history). 305 | - Vehicle Registration Numbers: PII 306 | 307 | Not PII: 308 | 309 | Some data may seem personally identifiable; however, it is not specific enough to identify an individual. 310 | 311 | - Standard 5-Digit Zip Code: Not PII 312 | - City: Not PII 313 | - State: Not PII 314 | - Country: Not PII 315 | - Age (in years): Not PII (Unless combined with other identifiers like date of birth). 316 | - Date or Timestamp (Example: created_date, created_timestamp, update_date, update_timestamp): Not PII (Unless combined with other identifiers like date of birth). 317 | - Gender: Not PII 318 | - Ethnicity/Race: Not PII (General categories, e.g., “Caucasian,” “Asian,” without additional identifiers). 319 | - Publicly Available Information: Not PII (Any information that is lawfully made available from federal, state, or local government records). 320 | - Generic Job Titles: Not PII (Titles like “Manager,” “Engineer,” without additional identifying details). 321 | - Company/Organization Name: Not PII (Names of companies or organizations without personal identifiers). 322 | 323 | Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in your “Is PII” determination of true or false. A value of 0.80 or higher is considered reasonably confident in your true or false answer. 324 | 325 | 326 | Reason: Indicate why you made the decision you did. 327 | 328 | Output: Please ensure that your output is JSON and matches the structure of the output examples provided. 329 | 330 | Example Input 1) 331 | JSON Source Table Object: 332 | { 333 | "Schema Name": "public", 334 | "Table Name": "customer", 335 | "Column Details": [ 336 | "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", 337 | "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", 338 | "Column No: 3 Named: state of type: character(2) Column Comments: NA", 339 | "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" 340 | ] 341 | } 342 | 343 | Column No: 4 344 | 345 | Example Output 1) 346 | { 347 | "Descriptor - Sensitive": { 348 | "Is PII": true, 349 | "Confidence Value": 0.85, 350 | "Reason": "The 'zip' column is identified as PII because its data type, character varying(10), allows for the possibility of storing extended zip codes, which matches an item on the provided PII list." 351 | } 352 | } 353 | 354 | Example Input 2) 355 | JSON Source Table Object: 356 | { 357 | "Schema Name": "public", 358 | "Table Name": "customer", 359 | "Column Details": [ 360 | "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", 361 | "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", 362 | "Column No: 3 Named: state of type: character(2) Column Comments: NA", 363 | "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" 364 | ] 365 | } 366 | 367 | Column No: 2 368 | 369 | Example Output 2) 370 | { 371 | "Descriptor - Sensitive": { 372 | "Is PII": false, 373 | "Confidence Value": 0.90, 374 | "Reason": "The 'city' column is not considered PII because city names do not match any item on the provided PII list." 
375 | } 376 | } 377 | 378 | Example Input 3) 379 | JSON Source Table Object: 380 | { 381 | "Schema Name": "public", 382 | "Table Name": "employee", 383 | "Column Details": [ 384 | "Column No: 1 Named: employee_id of type: uuid And is a primary key. Column Comments: NA", 385 | "Column No: 2 Named: full_name of type: character varying(255) Column Comments: NA", 386 | "Column No: 3 Named: email of type: character varying(255) Column Comments: NA", 387 | "Column No: 4 Named: salary of type: numeric Column Comments: NA" 388 | ] 389 | } 390 | 391 | Column No: 2 392 | 393 | Example Output 3) 394 | { 395 | "Descriptor - Sensitive": { 396 | "Is PII": true, 397 | "Confidence Value": 0.95, 398 | "Reason": "The 'full_name' column is identified as PII because it matches the 'Person's Name' item from the provided PII list." 399 | } 400 | } 401 | 402 | Example Input 4) 403 | JSON Source Table Object: 404 | { 405 | "Schema Name": "public", 406 | "Table Name": "order", 407 | "Column Details": [ 408 | "Column No: 1 Named: order_id of type: uuid And is a primary key. Column Comments: NA", 409 | "Column No: 2 Named: order_date of type: date Column Comments: NA", 410 | "Column No: 3 Named: customer_email of type: character varying(255) Column Comments: 'Email address of the customer who placed the order'", 411 | "Column No: 4 Named: total_amount of type: numeric Column Comments: NA" 412 | ] 413 | } 414 | 415 | Column No: 3 416 | 417 | Example Output 4) 418 | { 419 | "Descriptor - Sensitive": { 420 | "Is PII": true, 421 | "Confidence Value": 0.98, 422 | "Reason": "The 'customer_email' column is identified as PII because it matches the 'Email Address' item from the provided PII list." 423 | } 424 | } 425 | 426 | Now, based on the instructions and examples above, please generate the appropriate JSON output only for the following JSON Source Table Object and Column No inputs. 
{hints} 427 | 428 | JSON Source Table Object: {new_json} 429 | 430 | Column No: {column_no} 431 | 432 | "#, 433 | } 434 | } 435 | } -------------------------------------------------------------------------------- /extension/src/model/queries.rs: -------------------------------------------------------------------------------- 1 | use crate::utility::guc; 2 | 3 | pub const SOURCE_OBJECTS_JSON: &str = r#" 4 | WITH 5 | table_transformation_time_cal AS ( 6 | SELECT 7 | s.table_oid, 8 | MAX(s.valid_from) AS max_table_update, 9 | MAX(t.created_at) AS max_table_transformer_generation 10 | FROM auto_dw.source_objects AS s 11 | LEFT JOIN auto_dw.transformer_responses AS t ON s.pk_source_objects = t.fk_source_objects 12 | WHERE current_flag = 'Y' AND deleted_flag = 'N' 13 | GROUP BY table_oid), 14 | tables_requiring_transformation AS ( 15 | SELECT DISTINCT table_oid FROM table_transformation_time_cal 16 | WHERE (max_table_update > max_table_transformer_generation) OR max_table_transformer_generation IS NULL 17 | ), 18 | source_table_details AS ( 19 | SELECT s.* 20 | FROM auto_dw.source_objects AS s 21 | JOIN tables_requiring_transformation AS t ON s.table_oid = t.table_oid 22 | WHERE current_flag = 'Y' AND deleted_flag = 'N' 23 | ), 24 | source_prep AS ( 25 | SELECT 26 | table_oid, 27 | column_ordinal_position, 28 | json_build_object( 29 | 'PK Source Objects', pk_source_objects, 30 | 'Column Ordinal Position', column_ordinal_position 31 | ) AS column_link, 32 | schema_name, table_name, 33 | 'Column No: ' || column_ordinal_position || ' ' || 34 | 'Named: ' || column_name || ' ' || 35 | 'of type: ' || column_type_name || ' ' || 36 | CASE 37 | WHEN column_pk_ind = 1 THEN 'And is a primary key.' ELSE '' 38 | END || 39 | 'Column Comments: ' || column_description 40 | AS column_details 41 | FROM source_table_details 42 | ) 43 | SELECT 44 | table_oid, 45 | json_build_object( 46 | 'Column Links', array_agg(column_link ORDER BY column_ordinal_position ASC) 47 | ) AS table_column_links, 48 | json_build_object( 49 | 'Schema Name', schema_name, 50 | 'Table Name', table_name, 51 | 'Column Details', array_agg(column_details ORDER BY column_ordinal_position ASC) 52 | ) AS table_details 53 | FROM source_prep 54 | GROUP BY table_oid, schema_name, table_name 55 | ; 56 | "#; 57 | 58 | #[no_mangle] 59 | pub fn source_object_dw(schema_pattern_include: &str, table_pattern_include: &str, column_pattern_include: &str, schema_pattern_exclude: &str, table_pattern_exclude: &str, column_pattern_exclude: &str) -> String { 60 | 61 | let dw_schema = guc::get_guc(guc::PgAutoDWGuc::DwSchema).expect("DW SCHEMA GUC is not set."); 62 | 63 | format!(r#" 64 | DROP TABLE IF EXISTS temp_source_objects; 65 | 66 | CREATE TEMPORARY TABLE temp_source_objects AS 67 | WITH 68 | schema_qry AS ( 69 | SELECT 70 | pg_namespace.oid AS schema_oid, 71 | pg_namespace.nspname AS schema_name, 72 | pg_description.description AS schema_description 73 | FROM pg_catalog.pg_namespace 74 | LEFT JOIN pg_catalog.pg_description ON pg_namespace.oid = pg_description.objoid AND 75 | pg_description.objsubid = 0 -- No Sub Objects 76 | WHERE pg_namespace.nspname !~ 'pg_.*' AND pg_namespace.nspname NOT IN ('information_schema', 'auto_dw', '{dw_schema}') 77 | ), 78 | table_qry AS ( 79 | SELECT 80 | pg_class.oid AS table_oid, 81 | pg_class.relname AS table_name, 82 | pg_class.relnamespace AS table_schema_oid, 83 | pg_description.description AS table_description 84 | FROM pg_catalog.pg_class 85 | LEFT JOIN pg_catalog.pg_description ON pg_class.oid = pg_description.objoid AND 
86 | pg_description.objsubid = 0 -- No Sub Objects 87 | WHERE 88 | pg_class.relkind IN ('r', 'f') -- 'r' stands for ordinary table, 'f' stands for foreign table 89 | ), 90 | column_qry AS ( 91 | SELECT 92 | pg_attribute.attrelid AS column_table_oid, 93 | pg_attribute.attname AS column_name, 94 | pg_attribute.attnum AS column_ordinal_position, 95 | pg_attribute.atttypid AS column_type_oid, 96 | pg_attribute.atttypmod AS column_modification_number, 97 | pg_catalog.format_type(atttypid, atttypmod) AS column_type_name, 98 | pg_description.description AS column_description 99 | FROM pg_attribute 100 | LEFT JOIN pg_catalog.pg_description ON pg_attribute.attrelid = pg_description.objoid AND 101 | pg_attribute.attnum = pg_description.objsubid 102 | WHERE 103 | pg_attribute.attnum > 0 -- Only real columns, not system columns 104 | AND NOT pg_attribute.attisdropped -- Only columns that are not dropped 105 | ), 106 | type_qry AS ( 107 | SELECT 108 | oid AS type_oid, 109 | typname AS base_type_name 110 | FROM pg_type 111 | ), 112 | pk_table_column_qry AS ( 113 | SELECT 114 | conrelid AS table_oid, 115 | unnest(conkey) AS column_ordinal_position, 116 | 1 AS column_pk_ind, 117 | conname AS column_pk_name 118 | FROM 119 | pg_constraint 120 | WHERE 121 | contype = 'p' 122 | ), 123 | fk_table_column_qry AS ( 124 | SELECT DISTINCT -- DISTINCT: one column could have multiple FKs. 125 | conrelid AS table_oid, 126 | unnest(conkey) AS column_ordinal_position, 127 | 1 AS column_fk_ind 128 | FROM 129 | pg_constraint 130 | WHERE 131 | contype = 'f' 132 | ), 133 | source_objects_prep AS ( 134 | SELECT 135 | schema_qry.schema_oid, 136 | schema_qry.schema_name, 137 | schema_qry.schema_description, 138 | table_qry.table_oid, 139 | table_qry.table_name, 140 | COALESCE(table_qry.table_description, 'NA') AS table_description, 141 | column_qry.column_ordinal_position, 142 | column_qry.column_name, 143 | type_qry.base_type_name AS column_base_type_name, 144 | column_qry.column_modification_number, 145 | column_qry.column_type_name, 146 | COALESCE(column_qry.column_description, 'NA') AS column_description, 147 | COALESCE(pk_table_column_qry.column_pk_ind, 0) AS column_pk_ind, 148 | COALESCE(pk_table_column_qry.column_pk_name, 'NA') AS column_pk_name, 149 | COALESCE(fk_table_column_qry.column_fk_ind, 0) AS column_fk_ind 150 | FROM schema_qry 151 | LEFT JOIN table_qry ON schema_qry.schema_oid = table_qry.table_schema_oid 152 | LEFT JOIN column_qry ON table_qry.table_oid = column_qry.column_table_oid 153 | LEFT JOIN type_qry ON column_qry.column_type_oid = type_qry.type_oid 154 | LEFT JOIN pk_table_column_qry ON 155 | table_qry.table_oid = pk_table_column_qry.table_oid AND 156 | column_qry.column_ordinal_position = pk_table_column_qry.column_ordinal_position 157 | LEFT JOIN fk_table_column_qry ON 158 | table_qry.table_oid = fk_table_column_qry.table_oid AND 159 | column_qry.column_ordinal_position = fk_table_column_qry.column_ordinal_position 160 | ), 161 | table_source_list AS ( 162 | -- Currently on List 163 | SELECT 164 | schema_oid, 165 | table_oid, 166 | column_ordinal_position 167 | FROM auto_dw.source_objects 168 | WHERE current_flag = 'Y' AND deleted_flag = 'N' 169 | -- Adding TABLE COLUMNS 170 | UNION 171 | SELECT 172 | schema_oid, 173 | table_oid, 174 | column_ordinal_position 175 | FROM source_objects_prep 176 | -- 'a^' ~ matches nothing. 
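-- The '{}' placeholders below are filled with POSIX regex patterns and matched via the '~' operator: '.*' matches every name, while a pattern such as 'a^' can never match and therefore selects nothing, effectively disabling that include or exclude filter.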
177 | WHERE 178 | schema_name ~ '{}' AND 179 | table_name ~ '{}' AND 180 | column_name ~ '{}' 181 | --- Removing excluded schemas, tables, and columns 182 | EXCEPT 183 | SELECT 184 | schema_oid, 185 | table_oid, 186 | column_ordinal_position 187 | FROM source_objects_prep 188 | WHERE 189 | schema_name ~ '{}' AND 190 | table_name ~ '{}' AND 191 | column_name ~ '{}' 192 | ) 193 | SELECT 194 | source_objects_prep.schema_oid, 195 | source_objects_prep.schema_name, 196 | source_objects_prep.schema_description, 197 | source_objects_prep.table_oid, 198 | source_objects_prep.table_name, 199 | source_objects_prep.table_description, 200 | source_objects_prep.column_ordinal_position, 201 | source_objects_prep.column_name, 202 | source_objects_prep.column_base_type_name, 203 | source_objects_prep.column_modification_number, 204 | source_objects_prep.column_type_name, 205 | source_objects_prep.column_description, 206 | source_objects_prep.column_pk_ind, 207 | source_objects_prep.column_pk_name, 208 | source_objects_prep.column_fk_ind 209 | FROM source_objects_prep 210 | JOIN table_source_list ON 211 | source_objects_prep.schema_oid = table_source_list.schema_oid AND -- Remove to track tables even if they move schemas. 212 | source_objects_prep.table_oid = table_source_list.table_oid AND 213 | source_objects_prep.column_ordinal_position = table_source_list.column_ordinal_position 214 | ORDER BY source_objects_prep.schema_name, source_objects_prep.table_name, source_objects_prep.column_ordinal_position 215 | ; 216 | 217 | -- Mark anything that was deleted. 218 | UPDATE auto_dw.source_objects 219 | SET deleted_flag = 'Y' 220 | WHERE source_objects.current_flag = 'Y' 221 | AND NOT EXISTS ( 222 | SELECT 1 223 | FROM temp_source_objects 224 | WHERE source_objects.schema_oid = temp_source_objects.schema_oid 225 | AND source_objects.table_oid = temp_source_objects.table_oid 226 | AND source_objects.column_ordinal_position = temp_source_objects.column_ordinal_position 227 | ); 228 | 229 | -- If anything associated with current columns changes, set the current_flag to 'N' 230 | UPDATE auto_dw.source_objects 231 | SET valid_to = (now() AT TIME ZONE 'UTC'), current_flag = 'N' 232 | FROM temp_source_objects 233 | WHERE source_objects.current_flag = 'Y' 234 | AND source_objects.schema_oid = temp_source_objects.schema_oid 235 | AND source_objects.table_oid = temp_source_objects.table_oid 236 | AND source_objects.column_ordinal_position = temp_source_objects.column_ordinal_position 237 | AND ( 238 | source_objects.schema_name IS DISTINCT FROM temp_source_objects.schema_name OR 239 | source_objects.schema_description IS DISTINCT FROM temp_source_objects.schema_description OR 240 | source_objects.table_name IS DISTINCT FROM temp_source_objects.table_name OR 241 | source_objects.table_description IS DISTINCT FROM temp_source_objects.table_description OR 242 | source_objects.column_name IS DISTINCT FROM temp_source_objects.column_name OR 243 | source_objects.column_base_type_name IS DISTINCT FROM temp_source_objects.column_base_type_name OR 244 | source_objects.column_modification_number IS DISTINCT FROM temp_source_objects.column_modification_number OR 245 | source_objects.column_type_name IS DISTINCT FROM temp_source_objects.column_type_name OR 246 | source_objects.column_description IS DISTINCT FROM temp_source_objects.column_description OR 247 | source_objects.column_pk_ind IS DISTINCT FROM temp_source_objects.column_pk_ind OR 248 | source_objects.column_pk_name IS DISTINCT FROM temp_source_objects.column_pk_name OR 249 | source_objects.column_fk_ind 
IS DISTINCT FROM temp_source_objects.column_fk_ind 250 | ); 251 | 252 | -- If anything that was deleted from the prior record set comes back, clear the deleted flag. 253 | UPDATE auto_dw.source_objects 254 | SET deleted_flag = 'N' 255 | FROM temp_source_objects 256 | WHERE source_objects.current_flag = 'Y' AND source_objects.deleted_flag = 'Y' 257 | AND source_objects.schema_oid = temp_source_objects.schema_oid 258 | AND source_objects.table_oid = temp_source_objects.table_oid 259 | AND source_objects.column_ordinal_position = temp_source_objects.column_ordinal_position 260 | AND ( 261 | source_objects.schema_name = temp_source_objects.schema_name OR 262 | source_objects.schema_description = temp_source_objects.schema_description OR 263 | source_objects.table_name = temp_source_objects.table_name OR 264 | source_objects.table_description = temp_source_objects.table_description OR 265 | source_objects.column_name = temp_source_objects.column_name OR 266 | source_objects.column_base_type_name = temp_source_objects.column_base_type_name OR 267 | source_objects.column_modification_number = temp_source_objects.column_modification_number OR 268 | source_objects.column_type_name = temp_source_objects.column_type_name OR 269 | source_objects.column_description = temp_source_objects.column_description OR 270 | source_objects.column_pk_ind = temp_source_objects.column_pk_ind OR 271 | source_objects.column_pk_name = temp_source_objects.column_pk_name OR 272 | source_objects.column_fk_ind = temp_source_objects.column_fk_ind 273 | ); 274 | 275 | -- Inserting new records. 276 | INSERT INTO auto_dw.source_objects ( 277 | schema_oid, 278 | schema_name, 279 | schema_description, 280 | table_oid, 281 | table_name, 282 | table_description, 283 | column_ordinal_position, 284 | column_name, 285 | column_base_type_name, 286 | column_modification_number, 287 | column_type_name, 288 | column_description, 289 | column_pk_ind, 290 | column_pk_name, 291 | column_fk_ind 292 | ) 293 | SELECT 294 | temp_source_objects.schema_oid, 295 | temp_source_objects.schema_name, 296 | temp_source_objects.schema_description, 297 | temp_source_objects.table_oid, 298 | temp_source_objects.table_name, 299 | temp_source_objects.table_description, 300 | temp_source_objects.column_ordinal_position, 301 | temp_source_objects.column_name, 302 | temp_source_objects.column_base_type_name, 303 | temp_source_objects.column_modification_number, 304 | temp_source_objects.column_type_name, 305 | temp_source_objects.column_description, 306 | temp_source_objects.column_pk_ind, 307 | temp_source_objects.column_pk_name, 308 | temp_source_objects.column_fk_ind 309 | FROM temp_source_objects 310 | LEFT JOIN auto_dw.source_objects ON source_objects.current_flag = 'Y' 311 | AND source_objects.schema_oid = temp_source_objects.schema_oid 312 | AND source_objects.table_oid = temp_source_objects.table_oid 313 | AND source_objects.column_ordinal_position = temp_source_objects.column_ordinal_position 314 | WHERE source_objects.column_ordinal_position IS NULL; 315 | 316 | DROP TABLE IF EXISTS temp_source_objects; 317 | "#, schema_pattern_include, table_pattern_include, column_pattern_include, schema_pattern_exclude, table_pattern_exclude, column_pattern_exclude) 318 | } 319 | 320 | #[no_mangle] 321 | pub fn insert_into_build_call( 322 | build_id: &str, build_flag: &str, build_status: &str, status: &str, accepted_transformer_confidence_level: &str 323 | ) -> String { 324 | format!(r#" 325 | INSERT INTO auto_dw.build_call (fk_transformer_responses, build_id, build_flag, build_status) 
326 | WITH 327 | confidence_level AS (SELECT {accepted_transformer_confidence_level} AS value), 328 | source_objects_transformation_cal AS ( 329 | SELECT 330 | MAX(pk_transformer_responses) AS max_pk_transformer_response 331 | FROM auto_dw.transformer_responses AS t 332 | GROUP BY fk_source_objects 333 | ), 334 | source_object_transformation_latest AS ( 335 | SELECT t.* FROM auto_dw.transformer_responses AS t 336 | JOIN source_objects_transformation_cal AS c ON t.pk_transformer_responses = c.max_pk_transformer_response 337 | ), 338 | source_object_status_prep AS ( 339 | SELECT 340 | t.pk_transformer_responses, 341 | s.schema_name, 342 | s.table_name, 343 | s.column_name, 344 | s.column_ordinal_position, 345 | t.confidence_score, 346 | t.reason, 347 | t.category, 348 | t.model_name, 349 | CASE 350 | WHEN dws.column_ordinal_position IS NOT NULL THEN true ELSE false 351 | END AS is_dw, 352 | MAX( 353 | CASE 354 | WHEN t.category = 'Business Key Part' AND t.confidence_score < cl.value THEN 1 355 | ELSE 0 356 | END 357 | ) OVER (PARTITION BY s.schema_name, s.table_name) AS bk_hold, 358 | SUM( 359 | CASE 360 | WHEN t.category = 'Business Key Part' THEN 1 361 | ELSE 0 362 | END 363 | ) OVER (PARTITION BY s.schema_name, s.table_name) AS bkp_cnt 364 | FROM auto_dw.source_objects AS s 365 | JOIN confidence_level AS cl ON true 366 | LEFT JOIN auto_dw.dw_source_objects AS dws ON s.table_oid = dws.table_oid AND s.column_ordinal_position = dws.column_ordinal_position 367 | LEFT JOIN source_object_transformation_latest AS t ON s.pk_source_objects = t.fk_source_objects 368 | WHERE s.current_flag = 'Y' AND s.deleted_flag = 'N' 369 | ), 370 | source_object AS ( 371 | SELECT *, 372 | CASE 373 | WHEN is_dw THEN 'Built' 374 | WHEN confidence_score IS NULL THEN 'Queued for Processing' 375 | -- Links 376 | WHEN category = 'Business Key Part' AND confidence_score >= cl.value AND bkp_cnt > 1 THEN 'Ready to Deploy' 377 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 0 AND bkp_cnt > 1 THEN 'Ready to Deploy' 378 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 1 AND bkp_cnt > 1 THEN 'Ready to Deploy - Awaiting Business Key (BK)' 379 | -- Hubs 380 | WHEN category = 'Business Key Part' AND confidence_score >= cl.value THEN 'Ready to Deploy' 381 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 0 THEN 'Ready to Deploy' 382 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 1 THEN 'Ready to Deploy - Awaiting Business Key (BK)' 383 | 384 | ELSE 'Requires Attention' 385 | END AS status, 386 | CASE 387 | WHEN confidence_score IS NOT NULL THEN CONCAT((confidence_score * 100)::INT::TEXT, '%') 388 | ELSE '-' 389 | END AS confidence_level, 390 | CASE 391 | WHEN confidence_score IS NOT NULL THEN 392 | ( 393 | 'Status: ' || 394 | CASE 395 | WHEN confidence_score IS NULL THEN 'Queued for Processing' 396 | WHEN confidence_score >= cl.value THEN 'Ready to Deploy' 397 | ELSE 'Requires Attention' 398 | END || ': ' || 399 | 'Model: ' || model_name || 400 | ' categorized this column as a ' || category || 401 | ' with a confidence of ' || CONCAT((confidence_score * 100)::INT::TEXT, '%') || '. 
' || 402 | 'Model Reasoning: ' || reason 403 | ) 404 | ELSE '-' 405 | END AS status_response 406 | FROM source_object_status_prep 407 | JOIN confidence_level AS cl ON true 408 | ) 409 | SELECT 410 | pk_transformer_responses AS fk_transformer_responses, 411 | '{build_id}' AS build_id, 412 | '{build_flag}' AS build_flag, 413 | '{build_status}' AS build_status 414 | FROM source_object 415 | WHERE status = '{status}'; 416 | "#) 417 | } 418 | 419 | #[no_mangle] 420 | pub fn build_object_pull(build_id: &str) -> String { 421 | format!(r#" 422 | WITH system AS ( 423 | SELECT system_identifier AS id FROM pg_control_system() LIMIT 1 424 | ) 425 | SELECT 426 | schema_name::TEXT AS schema_name, 427 | table_name::TEXT AS table_name, 428 | category::TEXT AS column_category, 429 | business_key_name::TEXT AS business_key_name, 430 | column_name::TEXT AS column_name, 431 | column_type_name::TEXT AS column_type_name, 432 | system.id::BIGINT AS system_id, 433 | so.table_oid::OID as table_oid, 434 | so.column_ordinal_position::SMALLINT AS column_ordinal_position 435 | FROM system, auto_dw.build_call AS bc 436 | LEFT JOIN auto_dw.transformer_responses AS t ON bc.fk_transformer_responses = t.pk_transformer_responses 437 | LEFT JOIN auto_dw.source_objects AS so ON t.fk_source_objects = so.pk_source_objects 438 | WHERE build_id = '{}'; 439 | "#, build_id) 440 | } 441 | 442 | #[no_mangle] 443 | pub fn source_column(accepted_transformer_confidence_level: &str) -> String { 444 | format!(r#" 445 | WITH 446 | confidence_level AS (SELECT {accepted_transformer_confidence_level} AS value), 447 | source_objects_transformation_cal AS ( 448 | SELECT 449 | MAX(pk_transformer_responses) AS max_pk_transformer_response 450 | FROM auto_dw.transformer_responses AS t 451 | GROUP BY fk_source_objects 452 | ), 453 | source_object_transformation_latest AS ( 454 | SELECT t.* FROM auto_dw.transformer_responses AS t 455 | JOIN source_objects_transformation_cal AS c ON t.pk_transformer_responses = c.max_pk_transformer_response 456 | ), 457 | source_object_status_prep AS ( 458 | SELECT 459 | t.pk_transformer_responses, 460 | s.schema_name, 461 | s.schema_oid, 462 | s.table_name, 463 | s.table_oid, 464 | s.column_name, 465 | s.column_ordinal_position, 466 | t.confidence_score, 467 | t.reason, 468 | t.category, 469 | t.model_name, 470 | CASE 471 | WHEN dws.column_ordinal_position IS NOT NULL THEN true ELSE false 472 | END AS is_dw, 473 | MAX( 474 | CASE 475 | WHEN t.category = 'Business Key Part' AND t.confidence_score < cl.value THEN 1 476 | ELSE 0 477 | END 478 | ) OVER (PARTITION BY s.schema_name, s.table_name) AS bk_hold, 479 | SUM( 480 | CASE 481 | WHEN t.category = 'Business Key Part' THEN 1 482 | ELSE 0 483 | END 484 | ) OVER (PARTITION BY s.schema_name, s.table_name) AS bkp_cnt 485 | FROM auto_dw.source_objects AS s 486 | JOIN confidence_level AS cl ON true 487 | LEFT JOIN auto_dw.dw_source_objects AS dws ON s.table_oid = dws.table_oid AND s.column_ordinal_position = dws.column_ordinal_position 488 | LEFT JOIN source_object_transformation_latest AS t ON s.pk_source_objects = t.fk_source_objects 489 | WHERE s.current_flag = 'Y' AND s.deleted_flag = 'N' 490 | ), 491 | source_object AS ( 492 | SELECT *, 493 | CASE 494 | WHEN is_dw THEN 'Built' 495 | WHEN confidence_score IS NULL THEN 'Queued for Processing' 496 | -- Links 497 | WHEN category = 'Business Key Part' AND confidence_score >= cl.value AND bkp_cnt > 1 THEN 'Ready' 498 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 0 AND bkp_cnt > 1 THEN 
'Ready' 499 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 1 AND bkp_cnt > 1 THEN 'Ready - Awaiting Business Key (BK)' 500 | -- Hubs 501 | WHEN category = 'Business Key Part' AND confidence_score >= cl.value THEN 'Ready' 502 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 0 THEN 'Ready' 503 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 1 THEN 'Ready - Awaiting Business Key (BK)' 504 | 505 | ELSE 'Requires Attention' 506 | END AS status, 507 | CASE 508 | WHEN confidence_score IS NOT NULL THEN CONCAT((confidence_score * 100)::INT::TEXT, '%') 509 | ELSE '-' 510 | END AS confidence_level, 511 | CASE 512 | WHEN confidence_score IS NOT NULL THEN 513 | ( 514 | 'Status: ' || 515 | CASE 516 | WHEN confidence_score IS NULL THEN 'Queued for Processing' 517 | WHEN confidence_score >= cl.value THEN 'Ready' 518 | ELSE 'Requires Attention' 519 | END || ': ' || 520 | 'Model: ' || model_name || 521 | ' categorized this column as a ' || category || 522 | ' with a confidence of ' || CONCAT((confidence_score * 100)::INT::TEXT, '%') || '. ' || 523 | 'Model Reasoning: ' || reason 524 | ) 525 | ELSE '-' 526 | END AS status_response 527 | FROM source_object_status_prep 528 | JOIN confidence_level AS cl ON true 529 | ) 530 | SELECT 531 | schema_name::TEXT AS schema, 532 | schema_oid AS schema_oid, 533 | table_name::TEXT AS table, 534 | table_oid AS table_oid, 535 | column_name::TEXT AS column, 536 | column_ordinal_position AS column_ordinal_position, 537 | status, 538 | CASE 539 | WHEN category IS NULL THEN '-' 540 | WHEN category LIKE 'Descriptor - Sensitive' THEN 'Descriptor' 541 | ELSE category 542 | END::TEXT AS category, 543 | CASE 544 | WHEN category IS NULL THEN '-' 545 | WHEN category LIKE 'Descriptor - Sensitive' THEN 'True' 546 | ELSE 'False' 547 | END::TEXT AS is_sensitive, 548 | confidence_level, 549 | status_response 550 | FROM source_object 551 | ORDER BY schema_name, table_name, column_ordinal_position 552 | ; 553 | "#) 554 | } 555 | 556 | #[no_mangle] 557 | pub fn get_column_data(schema_name: &str, table_name: &str, column_name: &str) -> String { 558 | format!(r#" 559 | WITH 560 | system_qry AS ( 561 | SELECT system_identifier AS id FROM pg_control_system() LIMIT 1 562 | ), 563 | schema_qry AS ( 564 | SELECT 565 | pg_namespace.oid AS schema_oid, 566 | pg_namespace.nspname AS schema_name, 567 | pg_description.description AS schema_description 568 | FROM pg_catalog.pg_namespace 569 | LEFT JOIN pg_catalog.pg_description ON pg_namespace.oid = pg_description.objoid AND 570 | pg_description.objsubid = 0 -- No Sub Objects 571 | WHERE pg_namespace.nspname !~ 'pg_.*' AND pg_namespace.nspname NOT IN ('information_schema', 'auto_dw') 572 | ), 573 | table_qry AS ( 574 | SELECT 575 | pg_class.oid AS table_oid, 576 | pg_class.relname AS table_name, 577 | pg_class.relnamespace AS table_schema_oid, 578 | pg_description.description AS table_description 579 | FROM pg_catalog.pg_class 580 | LEFT JOIN pg_catalog.pg_description ON pg_class.oid = pg_description.objoid AND 581 | pg_description.objsubid = 0 -- No Sub Objects 582 | WHERE 583 | pg_class.relkind IN ('r', 'f') -- 'r' stands for ordinary table, 'f' stands for foreign table 584 | ), 585 | column_qry AS ( 586 | SELECT 587 | pg_attribute.attrelid AS column_table_oid, 588 | pg_attribute.attname AS column_name, 589 | pg_attribute.attnum AS column_ordinal_position, 590 | pg_attribute.atttypid AS column_type_oid, 591 | pg_attribute.atttypmod AS 
column_modification_number, 592 | pg_catalog.format_type(atttypid, atttypmod) AS column_type_name, 593 | pg_description.description AS column_description 594 | FROM pg_attribute 595 | LEFT JOIN pg_catalog.pg_description ON pg_attribute.attrelid = pg_description.objoid AND 596 | pg_attribute.attnum = pg_description.objsubid 597 | WHERE 598 | pg_attribute.attnum > 0 -- Only real columns, not system columns 599 | AND NOT pg_attribute.attisdropped -- Only columns that are not dropped 600 | ), 601 | type_qry AS ( 602 | SELECT 603 | oid AS type_oid, 604 | typname AS base_type_name 605 | FROM pg_type 606 | ), 607 | pk_table_column_qry AS ( 608 | SELECT 609 | conrelid AS table_oid, 610 | unnest(conkey) AS column_ordinal_position, 611 | 1 AS column_pk_ind, 612 | conname AS column_pk_name 613 | FROM 614 | pg_constraint 615 | WHERE 616 | contype = 'p' 617 | ), 618 | fk_table_column_qry AS ( 619 | SELECT DISTINCT -- DISTINCT: one column could have multiple FKs. 620 | conrelid AS table_oid, 621 | unnest(conkey) AS column_ordinal_position, 622 | 1 AS column_fk_ind 623 | FROM 624 | pg_constraint 625 | WHERE 626 | contype = 'f' 627 | ), 628 | source_objects_prep AS ( 629 | SELECT 630 | schema_qry.schema_oid, 631 | schema_qry.schema_name, 632 | schema_qry.schema_description, 633 | table_qry.table_oid, 634 | table_qry.table_name, 635 | COALESCE(table_qry.table_description, 'NA') AS table_description, 636 | column_qry.column_ordinal_position, 637 | column_qry.column_name, 638 | type_qry.base_type_name AS column_base_type_name, 639 | column_qry.column_modification_number, 640 | column_qry.column_type_name, 641 | COALESCE(column_qry.column_description, 'NA') AS column_description, 642 | COALESCE(pk_table_column_qry.column_pk_ind, 0) AS column_pk_ind, 643 | COALESCE(pk_table_column_qry.column_pk_name, 'NA') AS column_pk_name, 644 | COALESCE(fk_table_column_qry.column_fk_ind, 0) AS column_fk_ind 645 | FROM schema_qry 646 | LEFT JOIN table_qry ON schema_qry.schema_oid = table_qry.table_schema_oid 647 | LEFT JOIN column_qry ON table_qry.table_oid = column_qry.column_table_oid 648 | LEFT JOIN type_qry ON column_qry.column_type_oid = type_qry.type_oid 649 | LEFT JOIN pk_table_column_qry ON 650 | table_qry.table_oid = pk_table_column_qry.table_oid AND 651 | column_qry.column_ordinal_position = pk_table_column_qry.column_ordinal_position 652 | LEFT JOIN fk_table_column_qry ON 653 | table_qry.table_oid = fk_table_column_qry.table_oid AND 654 | column_qry.column_ordinal_position = fk_table_column_qry.column_ordinal_position 655 | ) 656 | SELECT 657 | system_qry.id::BIGINT AS system_id, 658 | source_objects_prep.schema_oid::OID as schema_oid, 659 | source_objects_prep.schema_name::TEXT AS schema_name, 660 | source_objects_prep.table_name::TEXT AS table_name, 661 | source_objects_prep.table_oid::OID as table_oid, 662 | source_objects_prep.column_name::TEXT AS column_name, 663 | source_objects_prep.column_ordinal_position::SMALLINT AS column_ordinal_position, 664 | source_objects_prep.column_type_name::TEXT AS column_type_name 665 | FROM source_objects_prep, system_qry 666 | WHERE 667 | schema_name = '{}' AND 668 | table_name = '{}' AND 669 | column_name = '{}' 670 | ; 671 | "#, schema_name, table_name, column_name) 672 | } -------------------------------------------------------------------------------- /extension/src/model/source_objects.rs: -------------------------------------------------------------------------------- 1 | use pgrx::{Json as JsonValue, pg_sys::Oid}; 2 | use serde::{Deserialize, Deserializer, Serialize}; 
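// Note: the serde `rename` attributes throughout this module mirror the human-readable JSON keys (e.g. "Column Links", "Column Details") emitted by SOURCE_OBJECTS_JSON in queries.rs.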
3 | 4 | #[derive(Debug)] 5 | pub struct SourceTablePrompt { 6 | #[allow(dead_code)] 7 | pub key: Oid, 8 | pub table_column_links: JsonValue, // For linking columns to foreign keys 9 | pub table_details: JsonValue, 10 | } 11 | 12 | #[derive(Debug, Serialize, Deserialize, Clone)] 13 | pub struct SourceTableDetail { 14 | #[serde(rename = "Schema Name")] 15 | pub schema_name: String, 16 | 17 | #[serde(rename = "Table Name")] 18 | pub table_name: String, 19 | 20 | #[serde(rename = "Column Details")] 21 | pub column_details: Vec<String>, 22 | } 23 | 24 | #[derive(Debug, Serialize, Deserialize)] 25 | pub struct Response { 26 | #[serde(rename = "Table ID")] 27 | pub table_id: u32, 28 | #[serde(rename = "Generation")] 29 | pub generation: GenerationTableDetail, 30 | } 31 | 32 | #[derive(Debug, Serialize, Deserialize, Clone)] 33 | pub struct GenerationColumnDetail { 34 | #[serde(rename = "Category")] 35 | pub category: String, 36 | #[serde(rename = "Business Key Name", deserialize_with = "replace_spaces_with_underscores")] 37 | pub business_key_name: String, 38 | #[serde(rename = "Column No")] 39 | pub column_no: i32, 40 | #[serde(rename = "Confidence")] 41 | pub confidence: f64, 42 | #[serde(rename = "Reason")] 43 | pub reason: String, 44 | } 45 | 46 | #[derive(Debug, Serialize, Deserialize, Clone)] 47 | pub struct GenerationTableDetail { 48 | #[serde(rename = "Schema Name")] 49 | pub schema_name: String, 50 | #[serde(rename = "Table Name")] 51 | pub table_name: String, 52 | #[serde(rename = "Column Details")] 53 | pub response_column_details: Vec<GenerationColumnDetail>, 54 | } 55 | 56 | #[derive(Debug, Serialize, Deserialize)] 57 | pub struct ColumnLink { 58 | #[serde(rename = "Column Ordinal Position")] 59 | pub column_ordinal_position: i32, 60 | #[serde(rename = "PK Source Objects")] 61 | pub pk_source_objects: i32, 62 | } 63 | 64 | #[derive(Debug, Serialize, Deserialize)] 65 | pub struct TableLinks { 66 | #[serde(rename = "Column Links")] 67 | pub column_links: Vec<ColumnLink>, 68 | } 69 | 70 | impl TableLinks { 71 | // Method to find the pk_source_objects based on column_ordinal_position 72 | pub fn find_pk_source_objects(&self, search_position: i32) -> Option<i32> { 73 | for link in &self.column_links { 74 | if link.column_ordinal_position == search_position { 75 | return Some(link.pk_source_objects); 76 | } 77 | } 78 | None 79 | } 80 | } 81 | 82 | fn replace_spaces_with_underscores<'de, D>(deserializer: D) -> Result<String, D::Error> 83 | where 84 | D: Deserializer<'de>, 85 | { 86 | let s = String::deserialize(deserializer)?; 87 | Ok(s.replace(' ', "_")) 88 | } -------------------------------------------------------------------------------- /extension/src/utility/guc.rs: -------------------------------------------------------------------------------- 1 | use pgrx::guc::*; 2 | use std::ffi::CStr; 3 | 4 | // Default not set due to security boundaries associated with extension install. 5 | // The background process has no way to determine which database the extension is installed in. 6 | // When the extension is being created, the database name can only be saved at the session level into the GUC. 
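// For example, the installing session can run: SET pg_auto_dw.database_name = 'postgres'; (the database name shown is illustrative).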
7 | pub static PG_AUTO_DW_DATABASE_NAME: GucSetting<Option<&'static CStr>> = GucSetting::<Option<&'static CStr>>::new(None); 8 | 9 | // Default not set, as this will make direct changes to the database 10 | pub static PG_AUTO_DW_DW_SCHEMA: GucSetting<Option<&'static CStr>> = GucSetting::<Option<&'static CStr>>::new(None); 11 | 12 | // Default set to Ollama 13 | pub static PG_AUTO_DW_TRANSFORMER_SERVER_TYPE: GucSetting<Option<&'static CStr>> = GucSetting::<Option<&'static CStr>>::new(Some(unsafe { 14 | CStr::from_bytes_with_nul_unchecked(b"ollama\0") 15 | })); 16 | 17 | // Default Transformer Server URL 18 | pub static PG_AUTO_DW_TRANSFORMER_SERVER_URL: GucSetting<Option<&'static CStr>> = GucSetting::<Option<&'static CStr>>::new(Some(unsafe { 19 | CStr::from_bytes_with_nul_unchecked(b"http://localhost:11434/api/generate\0") 20 | })); 21 | 22 | // Default not set 23 | pub static PG_AUTO_DW_TRANSFORMER_SERVER_TOKEN: GucSetting<Option<&'static CStr>> = GucSetting::<Option<&'static CStr>>::new(None); 24 | 25 | // Default model is "mistral" 26 | pub static PG_AUTO_DW_MODEL: GucSetting<Option<&'static CStr>> = GucSetting::<Option<&'static CStr>>::new(Some(unsafe { 27 | CStr::from_bytes_with_nul_unchecked(b"mistral\0") 28 | })); 29 | 30 | // The accepted threshold for the transformer's self-described confidence level - default 0.8. 31 | pub static PG_AUTO_DW_ACCEPTED_TRANSFORMER_CONFIDENCE_LEVEL: GucSetting<f64> = GucSetting::<f64>::new(0.8); 32 | 33 | // Number of times the transformer can be given the same request if a failure is recognized - default 3. 34 | pub static PG_AUTO_DW_TRANSFORMER_SERVER_MAX_RETRIES: GucSetting<i32> = GucSetting::<i32>::new(3); 35 | 36 | // Number of seconds to wait for the transformer's response - default 60 sec. 37 | pub static PG_AUTO_DW_TRANSFORMER_SERVER_WAIT_DURATION: GucSetting<i32> = GucSetting::<i32>::new(60); 38 | 39 | pub fn init_guc() { 40 | // Register the GUCs 41 | GucRegistry::define_string_guc( 42 | "pg_auto_dw.database_name", 43 | "Database name for the pg_auto_dw extension.", 44 | "Specifies the name of the database where the pg_auto_dw extension will be utilized.", 45 | &PG_AUTO_DW_DATABASE_NAME, 46 | GucContext::Suset, 47 | GucFlags::default(), 48 | ); 49 | 50 | GucRegistry::define_string_guc( 51 | "pg_auto_dw.dw_schema", 52 | "Data warehouse schema for the pg_auto_dw extension.", 53 | "Specifies the name of the schema within the database where the pg_auto_dw extension will automatically create and store data warehouse components.", 54 | &PG_AUTO_DW_DW_SCHEMA, 55 | GucContext::Suset, 56 | GucFlags::default(), 57 | ); 58 | 59 | GucRegistry::define_string_guc( 60 | "pg_auto_dw.transformer_server_type", 61 | "Transformer server type for the pg_auto_dw extension.", 62 | "Specifies the server type used by the pg_auto_dw extension. 
Currently available server types include ollama and openai.", 63 | &PG_AUTO_DW_TRANSFORMER_SERVER_TYPE, 64 | GucContext::Suset, 65 | GucFlags::default(), 66 | ); 67 | 68 | GucRegistry::define_string_guc( 69 | "pg_auto_dw.transformer_server_url", 70 | "Transformer URL for the pg_auto_dw extension.", 71 | "Specifies the URL for the transformer service used by the pg_auto_dw extension.", 72 | &PG_AUTO_DW_TRANSFORMER_SERVER_URL, 73 | GucContext::Suset, 74 | GucFlags::default(), 75 | ); 76 | 77 | GucRegistry::define_string_guc( 78 | "pg_auto_dw.transformer_server_token", 79 | "Bearer token for authenticating API calls to the Transformer Server for the pg_auto_dw extension.", 80 | "The Bearer token is required for authenticating API calls to the Transformer Server when interacting with the pg_auto_dw extension.", 81 | &PG_AUTO_DW_TRANSFORMER_SERVER_TOKEN, 82 | GucContext::Suset, 83 | GucFlags::default(), 84 | ); 85 | 86 | GucRegistry::define_string_guc( 87 | "pg_auto_dw.model", 88 | "Transformer model for the pg_auto_dw extension.", 89 | "Specifies the transformer model to be used by the pg_auto_dw extension for data processing or analysis.", 90 | &PG_AUTO_DW_MODEL, 91 | GucContext::Suset, 92 | GucFlags::default(), 93 | ); 94 | 95 | GucRegistry::define_float_guc( 96 | "pg_auto_dw.accepted_transformer_confidence_level", 97 | "Transformer generated confidence level for the pg_auto_dw extension.", 98 | "Specifies the confidence level threshold generated by the transformer model for the operations performed by the pg_auto_dw extension.", 99 | &PG_AUTO_DW_ACCEPTED_TRANSFORMER_CONFIDENCE_LEVEL, 100 | 0.0, // min value 101 | 1.0, // max value 102 | GucContext::Suset, 103 | GucFlags::default(), 104 | ); 105 | 106 | GucRegistry::define_int_guc( 107 | "pg_auto_dw.transformer_server_max_retries", 108 | "Maximum Transformer Retries", 109 | "Specifies the number of retry attempts the pg_auto_dw extension can make for a transformer request in case of failure.", 110 | &PG_AUTO_DW_TRANSFORMER_SERVER_MAX_RETRIES, 111 | 1, // min value 112 | 10, // max value 113 | GucContext::Suset, 114 | GucFlags::default(), 115 | ); 116 | 117 | GucRegistry::define_int_guc( 118 | "pg_auto_dw.transformer_server_wait_duration", 119 | "Maximum Transformer Server Wait Time", 120 | "Specifies the maximum number of seconds the pg_auto_dw extension will wait for a response from the transformer server.", 121 | &PG_AUTO_DW_TRANSFORMER_SERVER_WAIT_DURATION, 122 | 1, // min value 123 | 360, // max value 124 | GucContext::Suset, 125 | GucFlags::default(), 126 | ); 127 | 128 | } 129 | 130 | // For handling of GUCs that can be error prone 131 | #[derive(Clone, Debug)] 132 | pub enum PgAutoDWGuc { 133 | DatabaseName, 134 | DwSchema, 135 | TransformerServerType, 136 | TransformerServerUrl, 137 | TransformerServerToken, 138 | TransformerServerWaitDuration, 139 | TransformerServerMaxRetries, 140 | Model, 141 | AcceptedTransformerConfidenceLevel, 142 | } 143 | 144 | // A convenience function to get this project's GUCs 145 | pub fn get_guc(guc: PgAutoDWGuc) -> Option<String> { 146 | match guc { 147 | PgAutoDWGuc::DatabaseName => cstr_option_to_string(PG_AUTO_DW_DATABASE_NAME.get()), 148 | PgAutoDWGuc::DwSchema => cstr_option_to_string(PG_AUTO_DW_DW_SCHEMA.get()), 149 | PgAutoDWGuc::TransformerServerType => cstr_option_to_string(PG_AUTO_DW_TRANSFORMER_SERVER_TYPE.get()), 150 | PgAutoDWGuc::TransformerServerUrl => cstr_option_to_string(PG_AUTO_DW_TRANSFORMER_SERVER_URL.get()), 151 | PgAutoDWGuc::TransformerServerToken => 
cstr_option_to_string(PG_AUTO_DW_TRANSFORMER_SERVER_TOKEN.get()), 152 | PgAutoDWGuc::TransformerServerWaitDuration => cstr_from_int(PG_AUTO_DW_TRANSFORMER_SERVER_WAIT_DURATION.get()), 153 | PgAutoDWGuc::TransformerServerMaxRetries => cstr_from_int(PG_AUTO_DW_TRANSFORMER_SERVER_MAX_RETRIES.get()), 154 | PgAutoDWGuc::Model => cstr_option_to_string(PG_AUTO_DW_MODEL.get()), 155 | PgAutoDWGuc::AcceptedTransformerConfidenceLevel => cstr_from_float(PG_AUTO_DW_ACCEPTED_TRANSFORMER_CONFIDENCE_LEVEL.get()), 156 | } 157 | } 158 | 159 | fn cstr_option_to_string(cstr_o: Option<&CStr>) -> Option<String> { 160 | cstr_o 161 | .and_then(|cstr| cstr.to_str().ok().map(|s| s.to_owned())) 162 | } 163 | 164 | fn cstr_from_float(val: f64) -> Option<String> { 165 | Some(val.to_string()) 166 | } 167 | 168 | fn cstr_from_int(val: i32) -> Option<String> { 169 | Some(val.to_string()) 170 | } 171 | 172 | -------------------------------------------------------------------------------- /extension/src/utility/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod transformer_client; 2 | mod ollama_client; 3 | mod openai_client; 4 | pub mod setup; 5 | pub mod guc; -------------------------------------------------------------------------------- /extension/src/utility/ollama_client.rs: -------------------------------------------------------------------------------- 1 | use reqwest::ClientBuilder; 2 | use serde::{Deserialize, Serialize}; 3 | use std::time::Duration; 4 | 5 | use crate::utility::guc; 6 | use crate::model::prompt_template::PromptTemplate; 7 | 8 | #[derive(Serialize, Debug)] 9 | pub struct GenerateRequest { 10 | pub model: String, 11 | pub prompt: String, 12 | pub format: String, 13 | pub stream: bool, 14 | pub options: Options, 15 | } 16 | 17 | #[derive(Serialize, Debug)] 18 | pub struct Options { 19 | pub temperature: f64, 20 | } 21 | 22 | #[derive(Deserialize, Debug)] 23 | #[allow(dead_code)] 24 | pub struct GenerateResponse { 25 | pub model: String, 26 | pub created_at: String, 27 | pub response: String, 28 | pub done: bool, 29 | } 30 | 31 | pub async fn send_request(new_json: &str, template_type: PromptTemplate, col: &u32, hints: &str, timeout_in_sec: u64) -> Result<serde_json::Value, Box<dyn std::error::Error>> { 32 | 33 | let client = ClientBuilder::new().timeout(Duration::from_secs(timeout_in_sec)).build()?; // The default 30 sec is too short for some LLMs. 34 | 35 | let prompt_template = template_type.template(); 36 | 37 | // Inject new_json into the prompt_template 38 | let column_number = col.to_string(); 39 | let prompt = prompt_template 40 | .replace("{new_json}", new_json) 41 | .replace("{column_no}", &column_number) 42 | .replace("{hints}", &hints); 43 | 44 | // GUC Values for the transformer server 45 | let transformer_server_url = guc::get_guc(guc::PgAutoDWGuc::TransformerServerUrl).ok_or("GUC: Transformer Server URL is not set")?; 46 | let model = guc::get_guc(guc::PgAutoDWGuc::Model).ok_or("MODEL GUC is not set.")?; 47 | 48 | let temperature: f64 = 0.75; 49 | 50 | let options: Options = Options { 51 | temperature, 52 | }; 53 | 54 | let request = GenerateRequest { 55 | model, 56 | prompt, 57 | format: "json".to_string(), 58 | stream: false, 59 | options, 60 | }; 61 | 62 | let response = client 63 | .post(&transformer_server_url) 64 | .json(&request) 65 | .send() 66 | .await? 
67 | .json::<GenerateResponse>() 68 | .await?; 69 | 70 | // Deserialize 71 | let response_json: serde_json::Value = serde_json::from_str(&response.response)?; 72 | 73 | Ok(response_json) 74 | } 75 | -------------------------------------------------------------------------------- /extension/src/utility/openai_client.rs: -------------------------------------------------------------------------------- 1 | use pgrx::prelude::*; 2 | 3 | use reqwest::ClientBuilder; 4 | use serde::{Deserialize, Serialize}; 5 | use std::time::Duration; 6 | 7 | use crate::utility::guc; 8 | use crate::model::prompt_template::PromptTemplate; 9 | 10 | #[derive(Serialize, Debug)] 11 | pub struct Request { 12 | pub model: String, // Model name for OpenAI 13 | pub messages: Vec<Message>, // List of messages for chat format 14 | pub temperature: f64, // Temperature setting 15 | pub response_format: ResponseFormat, // JSON-only response format field 16 | } 17 | 18 | #[derive(Serialize, Deserialize, Debug)] 19 | pub struct Message { 20 | pub role: String, // "user", "assistant", or "system" 21 | pub content: String, // The actual prompt or message content 22 | } 23 | 24 | #[derive(Serialize, Debug)] 25 | pub struct ResponseFormat { 26 | #[serde(rename = "type")] 27 | pub r#type: String, // To ensure JSON response format 28 | } 29 | 30 | #[derive(Serialize, Deserialize, Debug)] 31 | pub struct Response { 32 | pub id: String, // Unique identifier for the chat session 33 | pub object: String, // Object type, usually "chat.completion" 34 | pub created: u64, // Timestamp when the response was created 35 | pub model: String, // Model name used for the response 36 | pub choices: Vec<Choice>, // List of choices (contains the actual answer) 37 | pub usage: Usage, // Information about token usage 38 | } 39 | 40 | #[derive(Serialize, Deserialize, Debug)] 41 | pub struct Choice { 42 | pub message: Message, // Contains the assistant's message 43 | pub finish_reason: Option<String>, // Reason for stopping (e.g., "stop") 44 | pub index: usize, // Index of the choice 45 | pub logprobs: Option<serde_json::Value>, // Log probabilities (if applicable) 46 | } 47 | 48 | #[derive(Serialize, Deserialize, Debug)] 49 | pub struct Usage { 50 | pub prompt_tokens: u32, // Number of tokens in the prompt 51 | pub completion_tokens: u32, // Number of tokens in the completion 52 | pub total_tokens: u32, // Total number of tokens used 53 | } 54 | 55 | pub async fn send_request(new_json: &str, template_type: PromptTemplate, col: &u32, hints: &str, timeout_in_sec: u64) -> Result<serde_json::Value, Box<dyn std::error::Error>> { 56 | 57 | let client = ClientBuilder::new().timeout(Duration::from_secs(timeout_in_sec)).build()?; // The default 30 sec is too short for some LLMs. 
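// The timeout is supplied by the caller and is presumably sourced from the pg_auto_dw.transformer_server_wait_duration GUC (default 60 sec).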
58 |
59 |     let prompt_template = template_type.template();
60 |     // let prompt_template = PromptTemplate::Test.template();
61 |
62 |     // Inject new_json, the column number, and any hints into the prompt template.
63 |     let column_number = col.to_string();
64 |     let prompt = prompt_template
65 |         .replace("{new_json}", new_json)
66 |         .replace("{column_no}", &column_number)
67 |         .replace("{hints}", hints);
68 |
69 |     // GUC values for the transformer server.
70 |     let transformer_server_url = guc::get_guc(guc::PgAutoDWGuc::TransformerServerUrl).ok_or("GUC: Transformer Server URL is not set.")?;
71 |     let transformer_server_token = guc::get_guc(guc::PgAutoDWGuc::TransformerServerToken).ok_or("GUC: Transformer Server Token is not set.")?;
72 |
73 |     let model = guc::get_guc(guc::PgAutoDWGuc::Model).ok_or("GUC: Model is not set.")?;
74 |
75 |     let json_type = String::from("json_object");
76 |     let response_format = ResponseFormat { r#type: json_type };
77 |
78 |     let temperature: f64 = 0.75;
79 |
80 |     let role = String::from("user");
81 |
82 |     let message = Message {
83 |         role,
84 |         content: prompt,
85 |     };
86 |
87 |     let messages = vec![message];
88 |
89 |     let request = Request {
90 |         model,
91 |         messages,
92 |         temperature,
93 |         response_format,
94 |     };
95 |
96 |     log!("Request: {:#?}", request.messages[0]);
97 |
98 |     let raw_response = client
99 |         .post(&transformer_server_url)
100 |         .header("Authorization", format!("Bearer {}", transformer_server_token))
101 |         .header("Content-Type", "application/json")
102 |         .json(&request)
103 |         .send()
104 |         .await?;
105 |
106 |     // Capture the body as a string for logging before deserializing.
107 |     let raw_body = raw_response.text().await?;
108 |     log!("Raw response body: {}", raw_body);
109 |
110 |     // Attempt to deserialize the response from the raw body.
111 |     let response: Response = match serde_json::from_str(&raw_body) {
112 |         Ok(parsed_response) => parsed_response,
113 |         Err(e) => {
114 |             log!("Error parsing response as type `Response`: {}", e);
115 |             return Err(Box::new(e));
116 |         }
117 |     };
118 |
119 |     // Extract the content string from the first choice.
120 |     let content_str = &response
121 |         .choices
122 |         .first()
123 |         .ok_or("No choices in response")?
124 |         .message
125 |         .content;
126 |
127 |     log!("Response: {:#?}", content_str);
128 |
129 |     // Parse the content string into a serde_json::Value.
130 |     let content_json: serde_json::Value = serde_json::from_str(content_str)?;
131 |
132 |     Ok(content_json)
133 | }
134 |
135 |
136 |
137 |
138 |
--------------------------------------------------------------------------------
/extension/src/utility/setup.rs:
--------------------------------------------------------------------------------
1 | use pgrx::prelude::*;
2 |
3 | extension_sql_file!("sql/info_tables.sql");
--------------------------------------------------------------------------------
/extension/src/utility/sql/info_tables.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE IF EXISTS source_objects;
2 |
3 | CREATE TABLE IF NOT EXISTS source_objects
4 | (
5 |     pk_source_objects bigserial PRIMARY KEY,
6 |     schema_oid oid,
7 |     schema_name name,
8 |     schema_description text,
9 |     table_oid oid,
10 |     table_name name,
11 |     table_description text,
12 |     column_ordinal_position smallint,
13 |     column_name name,
14 |     column_base_type_name name,
15 |     column_modification_number integer,
16 |     column_type_name text,
17 |     column_description text,
18 |     column_pk_ind INT DEFAULT 0,
19 |     column_pk_name name,
20 |     column_fk_ind INT DEFAULT 0,
21 |     column_dw_flag CHAR(1) DEFAULT 'N',
22 |     valid_from timestamp without time zone DEFAULT (now() AT TIME ZONE 'UTC'), -- Default to current UTC timestamp
23 |     valid_to timestamp without time zone, -- End of validity period
24 |     current_flag CHAR(1) DEFAULT 'Y', -- Indicator of current record
25 |     deleted_flag CHAR(1) DEFAULT 'N'
26 | );
27 |
28 | DROP TABLE IF EXISTS dw_source_objects;
29 |
30 | CREATE TABLE IF NOT EXISTS dw_source_objects
31 | (
32 |     pk_dw_source_objects BIGSERIAL PRIMARY KEY,
33 |     table_oid OID,
34 |     column_ordinal_position SMALLINT
35 | );
36 |
37 | DROP TABLE IF EXISTS transformer_responses;
38 |
39 | CREATE TABLE IF NOT EXISTS transformer_responses
40 | (
41 |     pk_transformer_responses BIGSERIAL PRIMARY KEY,
42 |     fk_source_objects BIGINT,
43 |     model_name TEXT,
44 |     category TEXT,
45 |     business_key_name TEXT,
46 |     confidence_score NUMERIC(3, 2),
47 |     reason TEXT,
48 |     created_at TIMESTAMP WITHOUT TIME ZONE DEFAULT (now() AT TIME ZONE 'UTC'),
49 |     CONSTRAINT fk_source_objects FOREIGN KEY (fk_source_objects)
50 |         REFERENCES source_objects(pk_source_objects)
51 |         ON DELETE CASCADE
52 | );
53 |
54 | DROP TABLE IF EXISTS build_call;
55 |
56 | CREATE TABLE IF NOT EXISTS build_call
57 | (
58 |     pk_build_call BIGSERIAL PRIMARY KEY,
59 |     fk_transformer_responses BIGINT,
60 |     build_id VARCHAR(100),
61 |     build_flag VARCHAR(100),
62 |     build_status VARCHAR(100),
63 |     created_at TIMESTAMP WITHOUT TIME ZONE DEFAULT (now() AT TIME ZONE 'UTC'),
64 |     CONSTRAINT fk_transformer_responses FOREIGN KEY (fk_transformer_responses)
65 |         REFERENCES transformer_responses(pk_transformer_responses)
66 |         ON DELETE CASCADE
67 | );
68 |
69 | DROP TABLE IF EXISTS dv_repo;
70 |
71 | CREATE TABLE dv_repo (
72 |     build_id TEXT,
73 |     insert_time TIMESTAMP WITHOUT TIME ZONE DEFAULT (now() AT TIME ZONE 'UTC'),
74 |     schema JSON
75 | );
76 |
77 | DROP TABLE IF EXISTS log;
78 |
79 | CREATE TABLE log (
80 |     pk_log BIGSERIAL PRIMARY KEY,
81 |     log_ts TIMESTAMP WITHOUT TIME ZONE DEFAULT (NOW() AT TIME ZONE 'UTC'),
82 |     process VARCHAR(50),
83 |     level VARCHAR(50),
84 |     message TEXT
85 | );
86 |
--------------------------------------------------------------------------------
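Taken together, these info tables trace the pipeline: `source_objects` catalogs the source columns, `transformer_responses` stores what the LLM inferred about each one, and `build_call` records which inferences were acted on. The query below is a minimal sketch of how they can be read together; it assumes the extension's schema (e.g., `auto_dw`) is on your `search_path`, and that `confidence_score` is the value weighed against the accepted-confidence GUC.

```SQL
-- Sketch: review the latest LLM classification per current source column.
SELECT
    so.schema_name,
    so.table_name,
    so.column_name,
    tr.category,            -- how the transformer classified the column
    tr.business_key_name,
    tr.confidence_score,    -- weighed against the accepted-confidence GUC
    tr.reason
FROM source_objects AS so
JOIN transformer_responses AS tr
    ON tr.fk_source_objects = so.pk_source_objects
WHERE so.current_flag = 'Y'
ORDER BY tr.created_at DESC;
```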
/extension/src/utility/transformer_client.rs:
--------------------------------------------------------------------------------
1 | use crate::model::prompt_template::PromptTemplate;
2 | use super::{guc, openai_client, ollama_client};
3 | use TransformerServerType::{OpenAI, Ollama};
4 | use std::str::FromStr;
5 |
6 | pub enum TransformerServerType {
7 |     OpenAI,
8 |     Ollama,
9 | }
10 |
11 | impl FromStr for TransformerServerType {
12 |     type Err = &'static str;
13 |
14 |     fn from_str(s: &str) -> Result<Self, Self::Err> {
15 |         match s.to_lowercase().as_str() {
16 |             "openai" => Ok(OpenAI),
17 |             "ollama" => Ok(Ollama),
18 |             _ => Err("Invalid Transformer Server Type"),
19 |         }
20 |     }
21 | }
22 |
23 | pub async fn send_request(new_json: &str, template_type: PromptTemplate, col: &u32, hints: &str) -> Result<serde_json::Value, Box<dyn std::error::Error>> {
24 |
25 |     let transformer_server_type_str = guc::get_guc(guc::PgAutoDWGuc::TransformerServerType).ok_or("GUC: Transformer Server Type is not set.")?;
26 |
27 |     let transformer_server_wait_duration = guc::get_guc(guc::PgAutoDWGuc::TransformerServerWaitDuration).ok_or("GUC: Transformer Server Wait Duration is not set.")?;
28 |     let timeout_in_sec: u64 = transformer_server_wait_duration.parse().expect("Transformer Server Wait Duration is not a valid u64");
29 |
30 |     let transformer_server_type = transformer_server_type_str.parse::<TransformerServerType>()
31 |         .map_err(|e| format!("Error parsing Transformer Server Type: {}", e))?;
32 |
33 |     match transformer_server_type {
34 |         OpenAI => openai_client::send_request(new_json, template_type, col, hints, timeout_in_sec).await,
35 |         Ollama => ollama_client::send_request(new_json, template_type, col, hints, timeout_in_sec).await,
36 |     }
37 | }
38 |
--------------------------------------------------------------------------------
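`transformer_client` resolves the server type, wait duration, and (through the per-client modules) the URL, token, and model entirely from GUCs at call time, so switching between OpenAI and Ollama is a configuration change rather than a code change. The sketch below shows how such a configuration might look; the GUC names are an assumption (here following a `pg_auto_dw.` prefix) and should be confirmed against `guc.rs`, and the URL and model name are only examples.

```SQL
-- Sketch, assuming the GUCs are registered under a "pg_auto_dw." prefix (confirm in guc.rs):
ALTER SYSTEM SET pg_auto_dw.transformer_server_type = 'ollama';   -- parsed case-insensitively
ALTER SYSTEM SET pg_auto_dw.transformer_server_url = 'http://localhost:11434/api/generate';
ALTER SYSTEM SET pg_auto_dw.model = 'mistral';                    -- example model name
ALTER SYSTEM SET pg_auto_dw.transformer_server_wait_duration = '300'; -- seconds, parsed as u64
SELECT pg_reload_conf();
```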