├── .github └── workflows │ └── extension_release.yml ├── .gitignore ├── CONTRIBUTING.md ├── LICENSE ├── README.md └── extension ├── .cargo └── config.toml ├── .gitignore ├── Cargo.toml ├── Trunk.toml ├── assets └── PG_AUTO_DW_LOGO.png ├── docs ├── readme.md └── sql_functions │ ├── go.md │ ├── health.md │ ├── readme.md │ ├── source_columns.md │ ├── source_exclude.md │ ├── source_include.md │ ├── source_tables.md │ └── update_context.md ├── pg_auto_dw.control └── src ├── bin └── pgrx_embed_pg_auto_dw.rs ├── controller ├── bgw_init.rs ├── bgw_source_objects.rs ├── bgw_transformer_client.rs ├── dv_builder.rs ├── dv_loader.rs └── mod.rs ├── lib.rs ├── model ├── dv_schema.rs ├── mod.rs ├── prompt_template.rs ├── queries.rs └── source_objects.rs └── utility ├── guc.rs ├── mod.rs ├── ollama_client.rs ├── openai_client.rs ├── setup.rs ├── sql └── info_tables.sql └── transformer_client.rs /.github/workflows/extension_release.yml: -------------------------------------------------------------------------------- 1 | name: Release pg_auto_dw 2 | 3 | defaults: 4 | run: 5 | shell: bash 6 | working-directory: ./extension 7 | 8 | on: 9 | pull_request: 10 | branches: 11 | - main 12 | paths-ignore: 13 | - "README.md" 14 | push: 15 | branches: 16 | - main 17 | paths-ignore: 18 | - "README.md" 19 | release: 20 | types: 21 | - created 22 | jobs: 23 | publish: 24 | if: github.event_name == 'release' 25 | name: trunk publish 26 | runs-on: ubuntu-latest 27 | strategy: 28 | matrix: 29 | pg-version: [14, 15, 16, 17] 30 | steps: 31 | - uses: actions/checkout@v2 32 | - name: Install Rust stable toolchain 33 | uses: actions-rs/toolchain@v1 34 | with: 35 | toolchain: stable 36 | - name: Install stoml and pg-trunk 37 | shell: bash 38 | run: | 39 | set -xe 40 | wget https://github.com/freshautomations/stoml/releases/download/v0.7.1/stoml_linux_amd64 &> /dev/null 41 | mv stoml_linux_amd64 stoml 42 | chmod +x stoml 43 | sudo mv stoml /usr/local/bin/ 44 | cargo install pg-trunk 45 | - name: trunk build 46 | working-directory: ./extension 47 | run: | 48 | ~/.cargo/bin/trunk build --pg-version ${{ matrix.pg-version }} 49 | - name: trunk publish 50 | working-directory: ./extension 51 | env: 52 | TRUNK_API_TOKEN: ${{ secrets.TRUNK_AUTH_TOKEN }} 53 | run: ~/.cargo/bin/trunk publish 54 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .DS_Store 2 | .idea/ 3 | /target 4 | *.iml 5 | **/*.rs.bk 6 | Cargo.lock 7 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to `pg_auto_dw` 2 | 3 | ## Releases 4 | 5 | `pg_auto_dw` follows [semantic versioning](semver.org) and is released to [pgt.dev](https://pgt.dev/extensions/pg_auto_dw). 6 | 7 | To release, follow these steps: 8 | 9 | 1. Create a PR updating the version in `Cargo.toml` and `Trunk.toml`. These two values must agree. 10 | 2. Merge the PR into the `main` branch. 11 | 3. [Create the release](https://github.com/tembo-io/pg_auto_dw/releases/new) 12 | 1. Use the tag format `vX.Y.Z` where `X.Y.Z` is the version number. e.g. `v0.1.0`. This version should be the same value as in `Cargo.toml` and `Trunk.toml`. 13 | 2. Click "Generated release notes" to auto-populate the release notes or fill in with your own content and notes. 14 | 3. 
Click "Publish release" 15 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | The PostgreSQL License 2 | 3 | Copyright (c) 2024, Tembo 4 | 5 | Permission to use, copy, modify, and distribute this software and its documentation for any purpose, without fee, and without a written agreement is hereby granted, provided that the above copyright notice and this paragraph and the following two paragraphs appear in all copies. 6 | 7 | IN NO EVENT SHALL TEMBO BE LIABLE TO ANY PARTY FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, EVEN IF TEMBO HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 8 | 9 | TEMBO SPECIFICALLY DISCLAIMS ANY WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND TEMBO HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR MODIFICATIONS. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # pg_auto_dw 2 | 3 | 4 | 5 | [![Static Badge](https://img.shields.io/badge/%40tembo-community?logo=slack&label=slack)](https://join.slack.com/t/tembocommunity/shared_invite/zt-277pu7chi-NHtvHWvLhHwyK0Y5Y6vTPw) 6 | [![OSSRank](https://shields.io/endpoint?url=https://ossrank.com/shield/4020)](https://ossrank.com/p/4020) 7 | [![Warning: Under Active Development](https://img.shields.io/badge/Warning-Under_Active_Development-yellow)](https://github.com/tembo-io/pg_auto_dw) 8 | 9 | 10 | ## Overview 11 | 12 | `pg_auto_dw` is a [permissively-licensed open-source](LICENSE) Postgres Extension that automates the creation of a Postgres-based data warehouse, given one or more transactional Postgres database inputs. 13 | 14 | We aim to do this within a structured environment that incorporates best practices and harnesses the capabilities of Large Language Models (LLM) technologies. 15 | 16 | We are starting with automation to facilitate a data vault implementation for our data warehouse. This will be a rudimentary raw vault setup, but we hope it will lead to substantial downstream business models. 17 | 18 | 19 | ## Goals 20 | 21 | - Automate the DW Build 22 | - Automate DW Maintenance 23 | - Understand DW Health 24 | - Support Data Governance 25 | 26 | These capabilities will be delivered through a [small set of Postgres functions](extension/docs/sql_functions/readme.md). 27 | 28 | ## Walkthrough 29 | 30 | ### Setup 31 | 32 | 1. Install extension 33 | 34 | ```SQL 35 | DROP EXTENSION IF EXISTS pg_auto_dw CASCADE; 36 | CREATE EXTENSION pg_auto_dw; 37 | ``` 38 | 39 | > Installing this extension installs a couple source sample tables in the PUBLIC SCHEMA as well as the PG_CRYPTO extension. 40 | 41 | 1. Restart your Postgres instance. 42 | 43 | 1. Create a destination schema 44 | 45 | Choose a name for a schema for your data warehouse to be built into. 46 | 47 | ```SQL 48 | DROP SCHEMA IF EXISTS my_dw CASCADE; 49 | CREATE SCHEMA my_dw; 50 | ``` 51 | 52 | 1. Reload configuration 53 | 54 | ```SQL 55 | SELECT pg_reload_conf(); 56 | ``` 57 | 58 | 1. 
Confirm setup
 59 | 
 60 | ```SQL
 61 | SHOW pg_auto_dw.database_name;
 62 | SHOW pg_auto_dw.dw_schema;
 63 | ```
 64 | This should return `postgres` and the schema name you selected.
 65 | 
 66 | 1. Set your LLM and reload configuration
 67 | 
 68 | ```SQL
 69 | ALTER SYSTEM SET pg_auto_dw.model TO 'gpt-4o';
 70 | ALTER SYSTEM SET pg_auto_dw.transformer_server_type TO 'openai';
 71 | ALTER SYSTEM SET pg_auto_dw.transformer_server_url TO 'https://api.openai.com/v1/chat/completions';
 72 | ALTER SYSTEM SET pg_auto_dw.transformer_server_token TO 'xxx';
 73 | SELECT pg_reload_conf();
 74 | ```
 75 | 
 76 | ### Load sample data
 77 | 
 78 | ```SQL
 79 | DROP TABLE IF EXISTS public.seller;
 80 | CREATE TABLE public.seller (
 81 |     seller_id UUID PRIMARY KEY, -- Designating seller_id as the primary key
 82 |     city VARCHAR(255),
 83 |     state CHAR(2),
 84 |     zip_5 VARCHAR(10)
 85 | );
 86 | 
 87 | INSERT INTO public.seller (seller_id, city, state, zip_5) VALUES
 88 | ('9449f25aeaf531019b76999ea49a6949','rio de janeiro','RJ','21040'),
 89 | ('9bc484c87d79cd4874e05ca182658045','sao paulo','SP','02422'),
 90 | ('3442f8959a84dea7ee197c632cb2df15','campinas','SP','13023'),
 91 | ('d149de2f383552baea37a7198c2296ce','sao paulo','SP','04193'),
 92 | ('c747d5b92c7648417faea95d36d763e8','pacatuba','CE','61800'),
 93 | ('455f46ef09a9e45667e2981df84b5cc2','sorocaba','SP','18052'),
 94 | ('8ff38bc3969e67c36c48343a07090f66','sao paulo','SP','08260'),
 95 | ('50bf89f1349bc0409a268c3a49678009','jaci','SP','15155'),
 96 | ('323ce52b5b81df2cd804b017b7f09aa7','sao paulo','SP','03306'),
 97 | ('1284de4ae8aa26997e748c851557cf0e','laranjeiras do sul','SP','85301'),
 98 | ('f80edd2c5aaa505cc4b0a3b219abf4b8','sao paulo','SP','03431');
 99 | 
100 | DROP TABLE IF EXISTS public.orders;
101 | CREATE TABLE public.orders (
102 |     order_id UUID PRIMARY KEY,
103 |     seller_id UUID,
104 |     order_date timestamp,
105 |     order_amount NUMERIC(10,2)
106 | );
107 | 
108 | INSERT INTO public.orders (order_id, seller_id, order_date, order_amount) VALUES
109 | (gen_random_uuid(), '9449f25aeaf531019b76999ea49a6949', now(), 20.01),
110 | (gen_random_uuid(), '9449f25aeaf531019b76999ea49a6949', now(), 44.01),
111 | (gen_random_uuid(), '9bc484c87d79cd4874e05ca182658045', now(), 99.03);
112 | ```
113 | 
114 | ### Build Data Warehouse
115 | 
116 | 1. Set your sources
117 | 
118 | ```SQL
119 | SELECT auto_dw.source_include('public', 'seller');
120 | SELECT auto_dw.source_include('public', 'orders');
121 | ```
122 | 
123 | Postgres regex is used behind the scenes. To do an exact match on the `seller` table, use:
124 | 
125 | ```SQL
126 | SELECT auto_dw.source_include('public', '^seller$');
127 | ```
128 | 
129 | 1. Confirm the table columns are queued for processing
130 | 
131 | ```SQL
132 | SELECT * FROM auto_dw.source_column();
133 | ```
134 | 
135 | You should see a list of columns with status `Queued for Processing`.
136 | 
137 | 1. Go
138 | 
139 | ```SQL
140 | SELECT auto_dw.go();
141 | ```
142 | 
143 | ### Accessing your data
144 | 
145 | Here's an example materialized view that pulls the data together into a flat structure.
146 | 
147 | ```SQL
148 | CREATE MATERIALIZED VIEW my_mat_view AS
149 | SELECT
150 |     sat_orders.order_date,
151 |     sat_orders.order_amount,
152 |     sat_seller.city,
153 |     sat_seller.state,
154 |     sat_seller.zip_5
155 | FROM my_dw.link_order_seller
156 | LEFT JOIN my_dw.sat_orders ON link_order_seller.link_order_seller_hk = sat_orders.link_order_seller_hk
157 | LEFT JOIN my_dw.hub_seller ON link_order_seller.hub_seller_hk = hub_seller.hub_seller_hk
158 | LEFT JOIN my_dw.sat_seller ON hub_seller.hub_seller_hk = sat_seller.hub_seller_hk;
159 | ```
160 | 
161 | 
162 | ### Tips
163 | 
164 | If your field isn't being interpreted correctly, try adding a comment to the field; the LLM takes column comments into account.
165 | 
166 | ```SQL
167 | COMMENT ON COLUMN public.orders.seller_id IS 'is business key';
168 | ```
169 | 
170 | ## Setting up foreign data wrappers
171 | 
172 | The example above reads data from the same instance that it's writing to. Normally you'd want to isolate analytical workloads from transactional workloads.
173 | 
174 | You can use Postgres foreign data wrapper functionality to accomplish this.
175 | 
176 | ```SQL
177 | -- Enable the postgres_fdw extension
178 | CREATE EXTENSION postgres_fdw;
179 | 
180 | -- Inspect existing foreign servers
181 | SELECT * FROM pg_foreign_server; -- Run on the previously configured client system to inspect existing foreign servers.
182 | 
183 | -- Create a new foreign server
184 | CREATE SERVER foreign_server
185 |     FOREIGN DATA WRAPPER postgres_fdw
186 |     OPTIONS (host 'remote_server_ip', dbname 'foreign_db', port '5432');
187 | 
188 | -- Inspect existing user mappings (if applicable)
189 | SELECT * FROM pg_user_mappings; -- Run on the previously configured client system to view user mappings for foreign servers.
190 | 
191 | -- Create a user mapping for the foreign server
192 | CREATE USER MAPPING FOR local_user
193 |     SERVER foreign_server
194 |     OPTIONS (user 'foreign_user', password 'password');
195 | 
196 | -- Manually define a foreign table
197 | CREATE FOREIGN TABLE foreign_table_name (
198 |     column1 datatype, -- Replace with the column name and datatype in the local schema.
199 |     column2 datatype  -- Repeat for additional columns.
200 | )
201 | SERVER foreign_server
202 | OPTIONS (
203 |     schema_name 'public', -- Schema name of the source table in the foreign server.
204 |     table_name 'source_table' -- Table name in the foreign server.
205 | );
206 | 
207 | -- Automatically via schema
208 | -- Use this approach to bulk import tables, minimizing manual effort.
209 | IMPORT FOREIGN SCHEMA public -- Replace 'public' with the schema name in the foreign server.
210 |     FROM SERVER foreign_server -- Specify the name of the foreign server.
211 |     INTO local_schema; -- Replace 'local_schema' with the schema name in the client system.
212 | ```
213 | 
214 | ## Advanced Demo: Auto Data Governance
215 | 
216 | Sometimes it's best to get a little push-back when creating a data warehouse; that push-back supports appropriate data governance. In this instance, a table was not ready to deploy to the data warehouse because one of its columns may need to be considered sensitive and handled appropriately. Auto DW's engine understands that the attribute is useful for analysis, but it also recognizes that the attribute may need to be treated as sensitive. In this script the user will:
217 | 
218 | 1) **Identify a Skipped Table**
219 | 
220 | ```SQL
221 | /* Identify source tables that were skipped and not integrated into the data warehouse. */
222 | SELECT schema, "table", status, status_response
223 | FROM auto_dw.source_table()
224 | WHERE status_code = 'SKIP';
225 | ```
226 | 
227 | > **Note:** Running this code will show which table was skipped along with a high-level reason. You should see the following output in the status_response: “Source Table was skipped as column(s) need additional context. Please run the following SQL query for more information: SELECT schema, table, column, status, status_response FROM auto_dw.source_status_detail() WHERE schema = 'public' AND table = 'customers'.”
228 | 
229 | 2) **Identify the Root Cause**
230 | 
231 | ```SQL
232 | /* Identify the source table column that caused the problem, understand the issue, and find a potential solution. */
233 | SELECT schema, "table", "column", status, confidence_level, status_response
234 | FROM auto_dw.source_column()
235 | WHERE schema = 'public' AND "table" = 'customer';
236 | ```
237 | 
238 | > **Note:** Running this code will show which table column was skipped along with a reason in the status_response. You should see the following output: “Requires Attention: Column cannot be appropriately categorized as it may contain sensitive data. Specifically, if the zip is an extended zip it may be considered PII.”
239 | 
240 | 3) **Decide to Institute Some Data Governance Best Practices**
241 | 
242 | ```SQL
243 | /* Altering column length restricts the acceptance of extended ZIP codes. */
244 | ALTER TABLE customer ALTER COLUMN zip TYPE VARCHAR(5);
245 | ```
246 | 
247 | > **Note:** Here the choice was up to the user to make a change that facilitated LLM understanding of data sensitivity. In this case, limiting the type to VARCHAR(5) lets the LLM understand that this column will not contain sensitive information in the future.
248 | 
249 | ```mermaid
250 | flowchart LR
251 |     Start(("Start")) --> tbl["Identify a Skipped Table\nauto_dw.source_table()"]
252 |     tbl --> col["Identify the Root Cause\nauto_dw.source_column()"]
253 |     col --> DW[("Institute Data Governance\nBest Practices")]
254 |     DW --> Done(("Done"))
255 | ```
256 | 
257 | **Auto DW Process Flow:** The script above demonstrates that there are several approaches to successfully implementing a data warehouse when using this extension. Below is a BPMN diagram that illustrates these various paths.
258 | 259 | ```mermaid 260 | flowchart LR 261 | subgraph functions_informative["Informative Functions"] 262 | direction LR 263 | health["auto_dw.health()"] 264 | source_tables["auto_dw.source_tables()"] 265 | source_column["auto_dw.source_column()"] 266 | end 267 | subgraph functions_interactive["Interactive Functions"] 268 | direction LR 269 | source_clude["auto_dw.source_include(object_pattern)"] 270 | update_context["auto_dw.update_context(object, context)"] 271 | go["auto_dw.go(flag, status)"] 272 | end 273 | subgraph data_gov["Data Governance"] 274 | direction BT 275 | to_gov{"X"} --> gov["Issue\nGovernance"] 276 | end 277 | start(("Start")) --> command["Choose Command"] 278 | command --> split{"X"} 279 | split --> health & source_tables & source_column & source_clude & update_context & go --> join{"X"} 280 | join --> review["Review Results"] 281 | review --> data_gov --> more_auto{"More\nAutomations?"} 282 | more_auto --> |no| done(("Done")) 283 | more_auto --> |yes| start_again(("Restart")) 284 | ``` 285 | 286 | -------------------------------------------------------------------------------- /extension/.cargo/config.toml: -------------------------------------------------------------------------------- 1 | [target.'cfg(target_os="macos")'] 2 | # Postgres symbols won't be available until runtime 3 | rustflags = ["-Clink-arg=-Wl,-undefined,dynamic_lookup"] 4 | -------------------------------------------------------------------------------- /extension/.gitignore: -------------------------------------------------------------------------------- 1 | /target -------------------------------------------------------------------------------- /extension/Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "pg_auto_dw" # Extension name 3 | version = "0.0.5" # Extension version (SemVer: MAJOR.MINOR.PATCH) 4 | edition = "2021" # Rust 2021 edition 5 | 6 | [lib] 7 | crate-type = ["cdylib", "lib"] 8 | 9 | [features] 10 | default = ["pg16"] 11 | pg14 = ["pgrx/pg14", "pgrx-tests/pg14" ] 12 | pg15 = ["pgrx/pg15", "pgrx-tests/pg15" ] 13 | pg16 = ["pgrx/pg16", "pgrx-tests/pg16" ] 14 | pg17 = ["pgrx/pg17", "pgrx-tests/pg17" ] 15 | pg_test = [] 16 | 17 | # Custom features 18 | experimental = [] 19 | 20 | [dependencies] 21 | pgrx = "0.12.9" 22 | serde = { version = "1.0", features = ["derive"] } 23 | serde_json = "1.0" 24 | reqwest = { version = "0.11", features = ["json"] } 25 | tokio = { version = "1", features = ["full"] } 26 | uuid = { version = "1.1", features = ["v4", "v5", "serde"] } 27 | chrono = { version = "0.4", features = ["serde"] } 28 | anyhow = "1.0" 29 | regex = "1.7" 30 | sha2 = "0.10" 31 | hex = "0.4" 32 | 33 | [dev-dependencies] 34 | pgrx-tests = "0.12.9" 35 | 36 | [profile.dev] 37 | panic = "unwind" 38 | 39 | [profile.release] 40 | panic = "unwind" 41 | opt-level = 3 42 | lto = "fat" 43 | codegen-units = 1 44 | -------------------------------------------------------------------------------- /extension/Trunk.toml: -------------------------------------------------------------------------------- 1 | [extension] 2 | name = "pg_auto_dw" 3 | version = "0.0.5" 4 | repository = "https://github.com/tembo-io/pg_auto_dw" 5 | license = "PostgreSQL" 6 | description = "An auto data warehouse extension for Postgres." 
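# NOTE: This version must match the version in Cargo.toml (CONTRIBUTING.md requires the two values to agree).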
 7 | homepage = "https://github.com/tembo-io/pg_auto_dw"
 8 | documentation = "https://github.com/tembo-io/pg_auto_dw"
 9 | categories = ["analytics", "orchestration"]
10 | loadable_libraries = [{ library_name = "pg_auto_dw", requires_restart = true }]
11 | 
12 | [build]
13 | postgres_version = "15"
14 | platform = "linux/amd64"
--------------------------------------------------------------------------------
/extension/assets/PG_AUTO_DW_LOGO.png:
--------------------------------------------------------------------------------
 https://raw.githubusercontent.com/tembo-io/pg_auto_dw/631e1946ebf8600f6459ddac16ddde58cfe0c646/extension/assets/PG_AUTO_DW_LOGO.png
--------------------------------------------------------------------------------
/extension/docs/readme.md:
--------------------------------------------------------------------------------
 1 | ## Documentation Hub
 2 | A Guide to PG_AUTO_DW
 3 | - [SQL Functions](sql_functions/readme.md)
--------------------------------------------------------------------------------
/extension/docs/sql_functions/go.md:
--------------------------------------------------------------------------------
 1 | ## Categories:
 2 | **[SQL Function - Interactive](readme.md#interactive-functions)**
 3 | 
 4 | # GO ![Status](https://img.shields.io/badge/status-draft-yellow)
 5 | Initiates data warehouse builds and dataflows.
 6 | 
 7 | ## Syntax
 8 | ``` SQL
 9 | go(flag, status)
10 | ```
11 | 
12 | ## Usage Notes
13 | Use this function to build an entire data warehouse or to push data from a single table into the built DW tables.
14 | 
15 | ## Examples
16 | 
17 | Build a Data Warehouse
18 | ```sql
19 | -- Builds a DW for all source tables that are ready-to-deploy.
20 | SELECT auto_dw.go('Build', 'RTD');
21 | ```
22 |     or
23 | ```sql
24 | -- Builds a DW for all source tables that are ready-to-deploy.
25 | SELECT auto_dw.go(); -- Runs the default.
26 | ```
27 |
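Before running a build, you can check which source tables are ready-to-deploy. A minimal sketch, assuming `source_tables()` exposes the same `schema`, `table`, and `status` columns that the repository's Advanced Demo selects from `auto_dw.source_table()`:

```sql
-- Review source-table status ahead of a build (illustrative).
SELECT schema, "table", status
FROM auto_dw.source_tables();
```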
28 | 29 | Perform a Dry Run 30 | ```sql 31 | -- Build, Test, and Rollback DW automation for all source tables that are ready-to-deploy. 32 | SELECT auto_dw.go('DryRun', 'RTD'); 33 | ``` 34 | 35 |
36 | 37 | Push data from a table. 38 | ```sql 39 | -- Push Source TABLE MARKETING.PROSPECTS data to the DW. 40 | SELECT auto_dw.go('Push-Table', 'marketing.prospects'); 41 | ``` 42 | -------------------------------------------------------------------------------- /extension/docs/sql_functions/health.md: -------------------------------------------------------------------------------- 1 | ## Categories: 2 | **[SQL Function - Informative](readme.md#informative-functions)** 3 | 4 | # HEALTH
![Status](https://img.shields.io/badge/status-draft-yellow)
 5 | 
 6 | Returns a table indicating the health of all DW automations.
 7 | 
 8 | ## Syntax
 9 | ```sql
10 | health()
11 | ```
12 | 
13 | ## Usage Notes
14 | Use this function often to understand the state of your data warehouse. Results can be used to identify operational errors and to check data availability.
15 | 
16 | ## Examples
17 | ```sql
18 | SELECT * FROM auto_dw.health();
19 | ```
20 | 
--------------------------------------------------------------------------------
/extension/docs/sql_functions/readme.md:
--------------------------------------------------------------------------------
 1 | ## SQL Function Documentation
 2 | ![Status](https://img.shields.io/badge/status-draft-yellow)
 3 | 
 4 | The following SQL functions provide the primary modality for interacting with the extension PG_AUTO_DW. Functions are broken into two categories: informative and interactive. Interactive functions can change the data warehouse (DW).
 5 | 
 6 | ### Informative Functions
 7 | These functions do not affect the database.
 8 | | Availability | Function | Purpose |
 9 | |--------------|---------------------------------------|-----------------------------------------------------------------------|
10 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`health()`](health.md) | Understand DW health. |
11 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`source_tables()`](source_tables.md) | Understand the status of all tables included for DW automation. |
12 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`source_columns()`](source_columns.md) | Understand the status of all table columns included for DW automation. |
13 | 
14 | ### Interactive Functions
15 | These functions can only affect the data warehouse portion of the database.
16 | | Availability | Function | Purpose |
17 | |--------------|---------------------------------------|-----------------------------------------------------------------------|
18 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`source_include(object_pattern)`](source_include.md) | Add source objects for DW automation. |
19 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`source_exclude(object_pattern)`](source_exclude.md) | Remove source objects from DW automation. |
20 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`update_context(object, context)`](update_context.md) | Provide information to facilitate DW automation. |
21 | | ![Proposal Version](https://img.shields.io/badge/proposal-0.0.1-blue) | [`go(flag, status)`](go.md) | Initiates DW builds and dataflows. |
--------------------------------------------------------------------------------
/extension/docs/sql_functions/source_columns.md:
--------------------------------------------------------------------------------
 1 | ## Categories:
 2 | **[SQL Function - Informative](readme.md#informative-functions)**
 3 | 
 4 | # SOURCE_COLUMNS
![Status](https://img.shields.io/badge/status-draft-yellow)
 5 | 
 6 | Returns a table indicating the status of all columns included for DW automation.
 7 | 
 8 | ## Syntax
 9 | ```sql
10 | source_columns()
11 | ```
12 | 
13 | ## Usage Notes
14 | Use this function to see the status of source columns in the DW automation process. Results can be used to identify table columns that require additional attention.
15 | 
16 | ## Examples
17 | ```sql
18 | SELECT * FROM auto_dw.source_columns();
19 | ```
20 | 
21 | 
--------------------------------------------------------------------------------
/extension/docs/sql_functions/source_exclude.md:
--------------------------------------------------------------------------------
 1 | ## Categories:
 2 | **[SQL Function - Interactive](readme.md#interactive-functions)**
 3 | 
 4 | # SOURCE_EXCLUDE
![Status](https://img.shields.io/badge/status-draft-yellow) 5 | 6 | - Removes objects from the DW automation queue. 7 | - Returns a table indicating objects that have been removed from the DW automation queue.
 8 | 
 9 | ## Syntax
10 | ``` SQL
11 | source_exclude(object_pattern)
12 | ```
13 | 
14 | ## Usage Notes
15 | Use this function to remove SCHEMAS, TABLES, and COLUMNS from the DW automation queue.
16 | 
17 | ## Examples
18 | 
19 | All objects in the PUBLIC SCHEMA have been added by default. To remove SCHEMA PUBLIC, issue the following statement.
20 | ```sql
21 | -- Remove PUBLIC SCHEMA and associated objects from the queue.
22 | SELECT * FROM auto_dw.source_exclude('PUBLIC');
23 | ```
24 |
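A single table can also be excluded. A sketch, assuming source_exclude() accepts the same `schema.table.*` patterns shown in the source_include() examples:

```sql
-- Remove TABLE MARKETING.PROSPECTS and all of its columns from the queue (illustrative).
SELECT * FROM auto_dw.source_exclude('marketing.prospects.*');
```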
25 | 
26 | Remove COLUMN from TABLE MARKETING.PROSPECTS
27 | ```sql
28 | -- Remove attribute LAST_REACHED_TS
29 | SELECT * FROM auto_dw.source_exclude('marketing.prospects.last_reached_ts');
30 | ```
31 | **Note:** If automations have already warehoused this column, they will not remove the associated DW column or its data.
32 | 
--------------------------------------------------------------------------------
/extension/docs/sql_functions/source_include.md:
--------------------------------------------------------------------------------
 1 | ## Categories:
 2 | **[SQL Function - Interactive](readme.md#interactive-functions)**
 3 | 
 4 | # SOURCE_INCLUDE
![Status](https://img.shields.io/badge/status-draft-yellow) 5 | 6 | - Adds objects to the DW automation queue. 7 | - Returns a table indicating objects that have been added to the DW automation queue.
 8 | 
 9 | ## Syntax
10 | ``` SQL
11 | source_include(object_pattern)
12 | ```
13 | 
14 | ## Usage Notes
15 | Use this function to add SCHEMAS, TABLES, and COLUMNS to the DW automation queue. If new attributes have been added to a table, you may add them to the queue with this function.
16 | 
17 | > **Note:** All objects in the PUBLIC schema are added by default upon extension creation. To remove them, see the example in the source_exclude() function.
18 | 
19 | ## Examples
20 | 
21 | Add TABLE ERROR_LOGS
22 | ```sql
23 | -- Adds all TABLE ERROR_LOGS COLUMNS to the queue.
24 | SELECT * FROM auto_dw.source_include('logging.error_logs.*');
25 | ```
26 | 
27 | Add SCHEMA MARKETING
28 | ```sql
29 | -- Adds all TABLES and TABLE COLUMNS from SCHEMA MARKETING.
30 | SELECT * FROM auto_dw.source_include('marketing.*.*');
31 | ```
32 | 
33 | Add new COLUMN from TABLE MARKETING.PROSPECTS
34 | ```sql
35 | -- Add attribute LAST_REACHED_TS
36 | SELECT * FROM auto_dw.source_include('marketing.prospects.last_reached_ts');
37 | ```
38 | 
39 | 
--------------------------------------------------------------------------------
/extension/docs/sql_functions/source_tables.md:
--------------------------------------------------------------------------------
 1 | ## Categories:
 2 | **[SQL Function - Informative](readme.md#informative-functions)**
 3 | 
 4 | # SOURCE_TABLES
![Status](https://img.shields.io/badge/status-draft-yellow)
 5 | 
 6 | Returns a table indicating the status of all tables included for DW automation.
 7 | 
 8 | ## Syntax
 9 | ```sql
10 | source_tables()
11 | ```
12 | 
13 | ## Usage Notes
14 | Use this function to see the status of source tables in the DW automation process. Results can be used to identify tables that require additional attention or to understand the DW build status.
15 | 
16 | ## Examples
17 | ```sql
18 | SELECT * FROM auto_dw.source_tables();
19 | ```
20 | 
--------------------------------------------------------------------------------
/extension/docs/sql_functions/update_context.md:
--------------------------------------------------------------------------------
 1 | ## Categories:
 2 | **[SQL Function - Interactive](readme.md#interactive-functions)**
 3 | 
 4 | # UPDATE_CONTEXT
![Status](https://img.shields.io/badge/status-draft-yellow)
 5 | Adds context to objects for DW automation processes.
 6 | 
 7 | ## Syntax
 8 | ``` SQL
 9 | update_context(object, context)
10 | ```
11 | 
12 | ## Usage Notes
13 | Use this function to add context to SCHEMAS, TABLES, and COLUMNS.
14 | 
15 | ## Examples
16 | 
17 | Adding a 4 AM Daily Schedule to TABLE ERROR_LOGS
18 | ```sql
19 | -- Schedule TABLE ERROR_LOGS for daily processing at 4 AM.
20 | SELECT auto_dw.update_context('logging.error_logs', '{"cron": "0 4 * * *"}');
21 | ```
22 | 
23 |
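After updating context, the informative functions can confirm that the object's status reflects the new context. A sketch reusing the source_column() call from the repository's Advanced Demo, pointed at the table from the example above:

```sql
-- Re-check column status after providing context (illustrative).
SELECT schema, "table", "column", status, status_response
FROM auto_dw.source_column()
WHERE schema = 'logging' AND "table" = 'error_logs';
```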
24 | 
25 | Indicate that COLUMN ZIP does not contain sensitive information.
26 | ```sql
27 | SELECT auto_dw.update_context('PUBLIC.CUSTOMER.ZIP', '{"sensitive": false}');
28 | ```
29 | 
30 | 
--------------------------------------------------------------------------------
/extension/pg_auto_dw.control:
--------------------------------------------------------------------------------
 1 | comment = 'Extension to automatically create downstream data warehouse tables.'
 2 | default_version = '@CARGO_VERSION@'
 3 | module_pathname = '$libdir/pg_auto_dw'
 4 | relocatable = false
 5 | superuser = true
 6 | schema = 'auto_dw'
--------------------------------------------------------------------------------
/extension/src/bin/pgrx_embed_pg_auto_dw.rs:
--------------------------------------------------------------------------------
 1 | ::pgrx::pgrx_embed!();
--------------------------------------------------------------------------------
/extension/src/controller/bgw_init.rs:
--------------------------------------------------------------------------------
 1 | use pgrx::bgworkers::*;
 2 | use pgrx::prelude::*;
 3 | 
 4 | use crate::utility::guc;
 5 | 
 6 | #[pg_guard]
 7 | pub extern "C" fn _PG_init() {
 8 | 
 9 |     guc::init_guc();
10 | 
11 |     let database_name_o = guc::get_guc(guc::PgAutoDWGuc::DatabaseName);
12 | 
13 |     match database_name_o {
14 |         Some(_database_name) => {
15 | 
16 |             BackgroundWorkerBuilder::new("Background Worker Source Object Update")
17 |                 .set_function("background_worker_source_objects")
18 |                 .set_library("pg_auto_dw")
19 |                 .enable_spi_access()
20 |                 .load();
21 | 
22 |             BackgroundWorkerBuilder::new("Background Worker Transformer Client")
23 |                 .set_function("background_worker_transformer_client")
24 |                 .set_library("pg_auto_dw")
25 |                 .enable_spi_access()
26 |                 .load();
27 |         }
28 |         None => {
29 |             log!("Database Name for this extension has not been set.");
30 |         }
31 |     }
32 | }
33 | 
34 | 
35 | 
36 | 
37 | 
38 | 
--------------------------------------------------------------------------------
/extension/src/controller/bgw_source_objects.rs:
--------------------------------------------------------------------------------
 1 | use pgrx::bgworkers::*;
 2 | use pgrx::prelude::*;
 3 | 
 4 | use std::time::Duration;
 5 | 
 6 | use crate::queries;
 7 | use crate::utility::guc;
 8 | 
 9 | #[pg_guard]
10 | #[no_mangle]
11 | pub extern "C" fn background_worker_source_objects(_arg: pg_sys::Datum) {
12 | 
13 |     let optional_database_name = guc::get_guc(guc::PgAutoDWGuc::DatabaseName);
14 | 
15 |     BackgroundWorker::attach_signal_handlers(SignalWakeFlags::SIGHUP | SignalWakeFlags::SIGTERM);
16 |     BackgroundWorker::connect_worker_to_spi(optional_database_name.as_deref(), None);
17 | 
18 |     while BackgroundWorker::wait_latch(Some(Duration::from_secs(10))) {
19 |         let result: Result<(), pgrx::spi::Error> = BackgroundWorker::transaction(|| {
20 |             Spi::connect(|mut client| {
21 | 
22 |                 let table_check_results: Result<SpiTupleTable, pgrx::spi::Error> =
23 |                     client.select("SELECT table_name FROM information_schema.tables WHERE table_schema = 'auto_dw' AND table_name = 'source_objects'", None, None);
24 |                 match table_check_results {
25 |                     Ok(table_check) => {
26 |                         if table_check.len() > 0 {
27 |                             client.update(
28 |                                 queries::source_object_dw(
29 |                                     "a^",
30 |                                     "a^",
31 |                                     "a^",
32 |                                     "a^",
33 |                                     "a^",
34 |                                     "a^"
35 |                                 ).as_str(),
36 |                                 None,
37 |                                 None,
38 |                             )?;
39 |                         } else {
40 |                             panic!("TABLE AUTO_DW.SOURCE_OBJECTS not found. 
PG_AUTO_DW Extension may need to be installed."); 41 | } 42 | }, 43 | Err(e) => { 44 | log!("Error checking TABLE AUTO_DW.SOURCE_OJBECTS: {:?}", e); 45 | } 46 | } 47 | Ok(()) 48 | }) 49 | }); 50 | result.unwrap_or_else(|e| panic!("got an error: {}", e)); 51 | } 52 | } -------------------------------------------------------------------------------- /extension/src/controller/bgw_transformer_client.rs: -------------------------------------------------------------------------------- 1 | use pgrx::bgworkers::*; 2 | use pgrx::{prelude::*, pg_sys::Oid}; 3 | 4 | use std::time::Duration; 5 | use std::collections::HashMap; 6 | use tokio::runtime::Runtime; 7 | use tokio::time::sleep; 8 | use serde::Deserialize; 9 | 10 | use crate::model::*; 11 | use crate::utility::transformer_client; 12 | use crate::utility::guc; 13 | use regex::Regex; 14 | 15 | #[pg_guard] 16 | #[no_mangle] 17 | pub extern "C" fn background_worker_transformer_client(_arg: pg_sys::Datum) { 18 | 19 | let max_transformer_retries = guc::get_guc(guc::PgAutoDWGuc::TransformerServerMaxRetries).unwrap(); 20 | let max_transformer_retries: u64 = max_transformer_retries.parse().expect("TransformerServerMaxRetries Not Valid u64"); 21 | 22 | let database_name_string = guc::get_guc(guc::PgAutoDWGuc::DatabaseName); 23 | let database_name_o: Option<&str> = database_name_string.as_deref(); 24 | 25 | BackgroundWorker::attach_signal_handlers(SignalWakeFlags::SIGHUP | SignalWakeFlags::SIGTERM); 26 | BackgroundWorker::connect_worker_to_spi(database_name_o, None); 27 | 28 | // Initialize Tokio runtime 29 | let runtime = Runtime::new().expect("Failed to create Tokio runtime"); 30 | 31 | while BackgroundWorker::wait_latch(Some(Duration::from_secs(10))) { 32 | 33 | extension_log("BGWorker: Transformer Client", "INFO", "Beginning Transformer Background Process."); 34 | 35 | // Load Prompts into Results 36 | let result: Result, pgrx::spi::Error> = BackgroundWorker::transaction(|| { 37 | Spi::connect(|client| { 38 | let source_objects_json = client.select(queries::SOURCE_OBJECTS_JSON, None, None)?; 39 | let mut v_source_table_prompts: Vec = Vec::new(); 40 | for source_object_json in source_objects_json { 41 | 42 | let table_oid = source_object_json.get_datum_by_ordinal(1)?.value::()?.unwrap(); 43 | let table_column_links = source_object_json.get_datum_by_ordinal(2)?.value::()?.unwrap(); 44 | let table_details = source_object_json.get_datum_by_ordinal(3)?.value::()?.unwrap(); 45 | 46 | let source_table_prompt = source_objects::SourceTablePrompt{ 47 | key: table_oid, 48 | table_column_links: table_column_links, 49 | table_details: table_details 50 | }; 51 | v_source_table_prompts.push(source_table_prompt) 52 | } 53 | Ok(v_source_table_prompts) 54 | }) 55 | }); 56 | 57 | // Get Prompts for Processing 58 | let v_source_table_prompts = result.unwrap_or_else(|e| panic!("got an error: {}", e)); 59 | 60 | // Process Each Prompt 61 | for source_table_prompt in v_source_table_prompts { 62 | log!("Starting Loop for Table Processing."); 63 | let table_details_json_str = serde_json::to_string_pretty(&source_table_prompt.table_details).expect("Failed to convert JSON Table Details to pretty string"); 64 | 65 | let table_column_link_json_str = serde_json::to_string_pretty(&source_table_prompt.table_column_links).expect("Failed to convert JSON Column Links to pretty string"); 66 | let table_column_links_o: Option = serde_json::from_str(&table_column_link_json_str).ok(); 67 | 68 | let columns = extract_column_numbers(&table_details_json_str); 69 | 70 | // Table Business Key 
Component Identification 71 | let mut generation_json_business_key_component_identification: Option = None; 72 | let mut generation_json_business_key_name: Option = None; 73 | let mut business_key_component_identification: HashMap<&u32, BusinessKeyComponentIdentification> = HashMap::new(); 74 | let mut business_key_name: HashMap<&u32, BusinessKeyName> = HashMap::new(); 75 | 76 | // Evaluate Attributes 77 | for column in &columns { 78 | let mut retries = 0; 79 | let mut hints = String::new(); 80 | 81 | while retries < max_transformer_retries { 82 | runtime.block_on(async { 83 | generation_json_business_key_component_identification = 84 | match transformer_client::send_request( 85 | table_details_json_str.as_str(), 86 | prompt_template::PromptTemplate::BKComponentIdentification, 87 | column, 88 | &hints).await { 89 | Ok(response_json) => { 90 | Some(response_json) 91 | }, 92 | Err(e) => { 93 | log!("Error in transformer request, BKComponentIdentification, malformed or timed out: {}", e); 94 | hints = format!("Hint: Please ensure you provide a JSON response only. This is your {} attempt and in that attept the following error is was given {e}.", retries + 1); 95 | 96 | log!("Delaying {retries} seconds for retry #{retries}."); 97 | sleep(Duration::from_secs(retries)).await; 98 | 99 | None 100 | } 101 | }; 102 | }); 103 | 104 | if generation_json_business_key_component_identification.is_none() { 105 | retries += 1; 106 | 107 | if retries >= max_transformer_retries { 108 | panic!("Max Transformer Retries Reached - restart backgrounder.") 109 | } 110 | 111 | // Skip to the next iteration 112 | continue; 113 | } 114 | 115 | match serde_json::from_value::(generation_json_business_key_component_identification.clone().unwrap()) { 116 | Ok(bki) => { 117 | business_key_component_identification.insert(column, bki); 118 | break; // Successfully Decoded 119 | } 120 | Err(e) => { 121 | log!("Error JSON JSON Structure not of type DescriptorSensitive: {}", e); 122 | } 123 | } 124 | retries += 1; 125 | log!("Transformer Retry No: {retries}"); 126 | } 127 | } 128 | 129 | // Generate Name if Identified as BK 130 | for column in &columns { 131 | let mut retries = 0; 132 | let mut hints = String::new(); 133 | 134 | match business_key_component_identification.get(column) { 135 | Some(bkci) => { 136 | if bkci.business_key_component_identification.is_business_key_component { 137 | // Identify BK Name 138 | while retries < max_transformer_retries { 139 | runtime.block_on(async { 140 | generation_json_business_key_name = 141 | match transformer_client::send_request(table_details_json_str.as_str(), prompt_template::PromptTemplate::BKName, &column, &hints).await { 142 | Ok(response_json) => { 143 | Some(response_json) 144 | }, 145 | Err(e) => { 146 | log!("Error in transformer request, BKName, malformed or timed out: {}", e); 147 | hints = format!("Hint: Please ensure you provide a JSON response only. 
This is your {} attempt and in that attept the following error is was given {e}.", retries + 1); 148 | 149 | log!("Delaying {retries} seconds for retry #{retries}."); 150 | sleep(Duration::from_secs(retries)).await; 151 | 152 | None 153 | } 154 | }; 155 | }); 156 | 157 | if generation_json_business_key_name.is_none() { 158 | retries += 1; 159 | 160 | if retries >= max_transformer_retries { 161 | panic!("Max Transformer Retries Reached - restart backgrounder.") 162 | } 163 | 164 | // Skip to the next iteration 165 | continue; 166 | } 167 | 168 | match serde_json::from_value::(generation_json_business_key_name.clone().unwrap()) { 169 | Ok(bkn) => { 170 | business_key_name.insert(column, bkn); 171 | break; // Successfully Decoded 172 | } 173 | Err(e) => { 174 | log!("Error JSON JSON Structure not of type BusinessKeyName: {}", e); 175 | } 176 | } 177 | 178 | retries += 1; 179 | } 180 | } else { 181 | continue; // Go do next column 182 | } 183 | } 184 | None => panic!("All columns should have been checked for business keys. No BusinessKeyComponetIdentification Struct Found."), 185 | } 186 | } 187 | 188 | // Identity Descriptor - Sensitive 189 | // let mut generation_json_descriptors_sensitive: HashMap<&u32, Option> = HashMap::new(); 190 | let mut descriptors_sensitive: HashMap<&u32, DescriptorSensitive> = HashMap::new(); 191 | let mut generation_json_descriptor_sensitive: Option = None; 192 | for column in &columns { 193 | let mut retries = 0; 194 | let mut hints = String::new(); 195 | while retries < max_transformer_retries { 196 | // Run the async block 197 | runtime.block_on(async { 198 | // Get Generation 199 | generation_json_descriptor_sensitive = 200 | match transformer_client::send_request( 201 | table_details_json_str.as_str(), 202 | prompt_template::PromptTemplate::DescriptorSensitive, 203 | column, 204 | &hints).await { 205 | Ok(response_json) => { 206 | Some(response_json) 207 | }, 208 | Err(e) => { 209 | log!("Error in transformer request, DescriptorSensitive, malformed or timed out: {}", e); 210 | hints = format!("Hint: Please ensure you provide a JSON response only. 
This is your {} attempt and in that attept the following error is was given {e}.", retries + 1); 211 | 212 | log!("Delaying {retries} seconds for retry #{retries}."); 213 | sleep(Duration::from_secs(retries)).await; 214 | 215 | None 216 | } 217 | }; 218 | // generation_json_descriptors_sensitive.insert(column, generation_json_descriptor_sensitive); 219 | }); 220 | 221 | if generation_json_descriptor_sensitive.is_none() { 222 | retries += 1; 223 | 224 | if retries >= max_transformer_retries { 225 | panic!("Max Transformer Retries Reached - restart backgrounder.") 226 | } 227 | 228 | // Skip to the next iteration 229 | continue; 230 | } 231 | 232 | match serde_json::from_value::(generation_json_descriptor_sensitive.clone().unwrap()) { 233 | Ok(des) => { 234 | // business_key_name_opt = Some(des); 235 | descriptors_sensitive.insert(column, des); 236 | break; // Successfully Decoded 237 | } 238 | Err(e) => { 239 | log!("Error JSON JSON Structure not of type DescriptorSensitive: {}", e); 240 | } 241 | } 242 | 243 | retries += 1; 244 | } 245 | } 246 | 247 | let table_column_links = table_column_links_o.unwrap(); 248 | 249 | // Build the SQL INSERT statement 250 | let mut insert_sql = String::from("INSERT INTO auto_dw.transformer_responses (fk_source_objects, model_name, category, business_key_name, confidence_score, reason) VALUES "); 251 | 252 | for (index, column) in columns.iter().enumerate() { 253 | 254 | let last = {index == table_column_links.column_links.len() - 1}; 255 | 256 | match (business_key_component_identification.get(column), business_key_name.get(column)) { 257 | (Some(business_key_component_identification), Some(business_key_name)) => { 258 | let category = "Business Key Part"; 259 | // Calculate the overall confidence score by taking the minimum of the confidence values 260 | // for the identified business key and the business key name. This approach is chosen to 261 | // ensure that the overall confidence reflects the weakest link, avoiding inflation of 262 | // the confidence score when one value is significantly lower than the other. 
263 | let confidence_score = 264 | business_key_component_identification.business_key_component_identification.confidence_value.min( 265 | business_key_name.business_key_name_values.confidence_value); 266 | let bk_name = &business_key_name.business_key_name_values.name; 267 | let bk_identified_reason = &business_key_component_identification.business_key_component_identification.reason; 268 | let bk_name_reason = &business_key_name.business_key_name_values.reason; 269 | let reason = format!("BK Identified Reason: {}, BK Naming Reason: {}", bk_identified_reason, bk_name_reason); 270 | let model_name_owned = guc::get_guc(guc::PgAutoDWGuc::Model).expect("MODEL GUC is not set."); 271 | let model_name = model_name_owned.as_str(); 272 | 273 | let pk_source_objects: i32; 274 | 275 | if let Some(pk_source_objects_temp) = table_column_links.find_pk_source_objects(column.clone() as i32) { 276 | pk_source_objects = pk_source_objects_temp; 277 | } else { 278 | println!("No match found for column_ordinal_position: {}", column); 279 | panic!() 280 | } 281 | 282 | if !last { 283 | insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}'),", pk_source_objects, model_name, category, bk_name.replace(" ", "_"), confidence_score, reason.replace("'", "''"))); 284 | } else { 285 | insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}');", pk_source_objects, model_name, category, bk_name.replace(" ", "_"), confidence_score, reason.replace("'", "''"))); 286 | } 287 | 288 | } 289 | _ => { // Not Identified as BKs 290 | let pk_source_objects: i32; 291 | let mut category = "Descriptor"; 292 | let mut confidence_score: f64 = 1.0; 293 | let bk_name = "NA"; 294 | let mut reason = "Defaulted of category 'Descriptor' maintained.".to_string(); 295 | let model_name_owned = guc::get_guc(guc::PgAutoDWGuc::Model).expect("MODEL GUC is not set."); 296 | let model_name = model_name_owned.as_str(); 297 | 298 | if let Some(pk_source_objects_temp) = table_column_links.find_pk_source_objects(column.clone() as i32) { 299 | pk_source_objects = pk_source_objects_temp; 300 | } else { 301 | println!("No match found for column_ordinal_position: {}", column); 302 | panic!() 303 | } 304 | 305 | if let Some(descriptor_sensitive) = descriptors_sensitive.get(&column) { 306 | if descriptor_sensitive.descriptor_sensitive_values.is_pii && (descriptor_sensitive.descriptor_sensitive_values.confidence_value > 0.5) { 307 | category = "Descriptor - Sensitive"; 308 | confidence_score = descriptor_sensitive.descriptor_sensitive_values.confidence_value; 309 | reason = descriptor_sensitive.descriptor_sensitive_values.reason.clone(); 310 | } 311 | } else { 312 | log!("Teseting Can't find a response for {} in Descriptors Sensitive Hashmap.", column); 313 | } 314 | 315 | if !last { 316 | insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}'),", pk_source_objects, model_name, category, bk_name.replace(" ", "_"), confidence_score, reason.replace("'", "''"))); 317 | } else { 318 | insert_sql.push_str(&format!("({}, '{}', '{}', '{}', {}, '{}');", pk_source_objects, model_name, category, bk_name.replace(" ", "_"), confidence_score, reason.replace("'", "''"))); 319 | } 320 | } 321 | } 322 | } 323 | 324 | // Push Generation to TABLE TRANSFORMER_RESPONSES 325 | BackgroundWorker::transaction(|| { 326 | Spi::connect(|mut client| { 327 | _ = client.update(insert_sql.as_str(), None, None); 328 | }) 329 | }); 330 | } 331 | 332 | } 333 | } 334 | 335 | fn extension_log(process: &str, level: &str, message: &str) { 336 | 337 | let insert_statement = 
format!(r#" 338 | INSERT INTO auto_dw.log (process, level, message) 339 | VALUES ('{}', '{}', '{}'); 340 | "#, process, level, message); 341 | 342 | BackgroundWorker::transaction(|| { 343 | Spi::connect(|mut client| { 344 | _ = client.update(insert_statement.as_str(), None, None); 345 | }) 346 | }); 347 | } 348 | 349 | fn extract_column_numbers(json_str: &str) -> Vec { 350 | // Define a regex to capture the column numbers 351 | let re = Regex::new(r"Column No: (\d+)").expect("Invalid regex"); 352 | 353 | // Find all matches and collect the column numbers 354 | re.captures_iter(json_str) 355 | .filter_map(|caps| caps.get(1).map(|m| m.as_str().parse::().unwrap())) 356 | .collect() 357 | } 358 | 359 | #[derive(Deserialize, Debug)] 360 | enum TableClassificationType { 361 | Hub, 362 | Link, 363 | } 364 | 365 | #[derive(Deserialize, Debug)] 366 | struct BusinessKeyComponentIdentification { 367 | #[serde(rename = "Business Key Component Identification")] 368 | business_key_component_identification: BusinessKeyComponentIdentificationValues, 369 | } 370 | 371 | #[derive(Deserialize, Debug)] 372 | struct BusinessKeyComponentIdentificationValues { 373 | #[serde(rename = "Is Business Key Component")] 374 | is_business_key_component: bool, 375 | #[serde(rename = "Confidence Value")] 376 | confidence_value: f64, 377 | #[serde(rename = "Reason")] 378 | reason: String, 379 | } 380 | 381 | #[derive(Deserialize, Debug)] 382 | struct BusinessKeyName { 383 | #[serde(rename = "Business Key Name")] 384 | business_key_name_values: BusinessKeyNameValues, 385 | } 386 | 387 | #[derive(Deserialize, Debug)] 388 | struct BusinessKeyNameValues { 389 | #[serde(rename = "Name")] 390 | name: String, 391 | #[serde(rename = "Confidence Value")] 392 | confidence_value: f64, 393 | #[serde(rename = "Reason")] 394 | reason: String, 395 | } 396 | 397 | #[derive(Deserialize, Debug)] 398 | struct DescriptorSensitive { 399 | #[serde(rename = "Descriptor - Sensitive")] 400 | descriptor_sensitive_values: DescriptorSensitiveValues, 401 | } 402 | 403 | #[derive(Deserialize, Debug)] 404 | struct DescriptorSensitiveValues { 405 | #[serde(rename = "Is PII")] 406 | is_pii: bool, 407 | #[serde(rename = "Confidence Value")] 408 | confidence_value: f64, 409 | #[serde(rename = "Reason")] 410 | reason: String, 411 | } 412 | 413 | -------------------------------------------------------------------------------- /extension/src/controller/dv_builder.rs: -------------------------------------------------------------------------------- 1 | use pgrx::{prelude::*, pg_sys::Oid}; 2 | use uuid::Uuid; 3 | use std::collections::HashMap; 4 | use chrono::Utc; 5 | 6 | use crate::model::queries; 7 | use crate::utility::guc; 8 | use crate::model::dv_schema::{ 9 | DVSchema, 10 | LinkKey, 11 | BusinessKey, 12 | BusinessKeyPartLink, 13 | Descriptor, 14 | DescriptorLink, 15 | ColumnData 16 | }; 17 | 18 | use super::dv_loader::*; 19 | 20 | pub fn build_dv(build_id: Uuid, dv_objects_query: &str, load_data: bool) { 21 | 22 | let mut dv_objects_hm: HashMap> = HashMap::new(); 23 | 24 | Spi::connect(|client| 25 | { 26 | let dv_objects_result = client.select(dv_objects_query, None, None); 27 | 28 | match dv_objects_result { 29 | 30 | Ok(dv_objects) => { 31 | 32 | for dv_object in dv_objects { 33 | 34 | let schema_name = dv_object.get_datum_by_ordinal(1).unwrap().value::().unwrap().unwrap(); 35 | let table_name = dv_object.get_datum_by_ordinal(2).unwrap().value::().unwrap().unwrap(); 36 | let column_category = 
dv_object.get_datum_by_ordinal(3).unwrap().value::().unwrap().unwrap(); 37 | let business_key_name = dv_object.get_datum_by_ordinal(4).unwrap().value::().unwrap().unwrap(); 38 | let column_name = dv_object.get_datum_by_ordinal(5).unwrap().value::().unwrap().unwrap(); 39 | let column_type_name = dv_object.get_datum_by_ordinal(6).unwrap().value::().unwrap().unwrap(); 40 | let system_id = dv_object.get_datum_by_ordinal(7).unwrap().value::().unwrap().unwrap(); 41 | let table_oid: Oid = dv_object.get_datum_by_ordinal(8).unwrap().value::().unwrap().unwrap(); 42 | let column_ordinal_position = dv_object.get_datum_by_ordinal(9).unwrap().value::().unwrap().unwrap(); 43 | 44 | let column_category = ColumnCategory::from_str(&column_category); 45 | 46 | let transformer_object: TransformerObject = 47 | TransformerObject { 48 | schema_name, 49 | table_name, 50 | business_key_name, 51 | column_name, 52 | column_type_name, 53 | system_id, 54 | table_oid, 55 | column_ordinal_position, 56 | column_category, 57 | }; 58 | 59 | // Bucket TransformerObject by table 60 | dv_objects_hm 61 | .entry(table_oid) 62 | .or_insert_with(Vec::new) 63 | .push(transformer_object); 64 | 65 | } 66 | } 67 | 68 | Err(e) => { 69 | log!("Error getting DV Transformer Objects Result: {:?}", e); 70 | } 71 | } 72 | } 73 | ); 74 | 75 | // Ensure ordering based on column ordinality for consistent processing. 76 | for dv_object in dv_objects_hm.values_mut() { 77 | dv_object.sort_by_key(|dv_object| dv_object.column_ordinal_position); 78 | } 79 | 80 | let ( 81 | dv_objects_hm_single_bkp, 82 | dv_objects_hm_multiple_bkp, 83 | ) = separate_by_business_parts(dv_objects_hm); 84 | 85 | // Build a Vector of LinkKeys 86 | let mut link_keys: Vec = Vec::new(); 87 | 88 | for dv_objects_v in dv_objects_hm_multiple_bkp { 89 | 90 | let mut descriptors: Vec = Vec::new(); 91 | 92 | // Build Descriptors 93 | for dv_object in &dv_objects_v.1 { 94 | 95 | let column_data_id = Uuid::new_v4(); 96 | 97 | let column_data = ColumnData { 98 | id: column_data_id, 99 | system_id: dv_object.system_id, 100 | schema_name: dv_object.schema_name.clone(), 101 | table_oid: dv_object.table_oid, 102 | table_name: dv_object.table_name.clone(), 103 | column_name: dv_object.column_name.clone(), 104 | column_ordinal_position: dv_object.column_ordinal_position, 105 | column_type_name: dv_object.column_type_name.clone(), 106 | }; 107 | let orbit = dv_object.table_name.clone(); 108 | 109 | if dv_object.column_category == ColumnCategory::Descriptor { 110 | let descriptor = get_descriptor(dv_object.column_name.clone(), column_data, orbit, false); 111 | descriptors.push(descriptor); 112 | } else if dv_object.column_category == ColumnCategory::DescriptorSensitive { 113 | let descriptor = get_descriptor(dv_object.column_name.clone(), column_data, orbit, true); 114 | descriptors.push(descriptor); 115 | } 116 | } 117 | 118 | let mut business_keys: Vec = Vec::new(); 119 | // Build Business keys 120 | for dv_object in &dv_objects_v.1 { 121 | 122 | if dv_object.column_category == ColumnCategory::BusinessKeyPart { 123 | let column_data_id = Uuid::new_v4(); 124 | 125 | let column_data = ColumnData { 126 | id: column_data_id, 127 | system_id: dv_object.system_id, 128 | schema_name: dv_object.schema_name.clone(), 129 | table_oid: dv_object.table_oid, 130 | table_name: dv_object.table_name.clone(), 131 | column_name: dv_object.column_name.clone(), 132 | column_ordinal_position: dv_object.column_ordinal_position, 133 | column_type_name: dv_object.column_type_name.clone(), 134 | }; 135 | 136 | let 
mut business_key_part_links: Vec = Vec::new(); 137 | 138 | if dv_object.column_category == ColumnCategory::BusinessKeyPart { 139 | // Alias good for 1 BKP, Refactor for many BKPs 140 | let business_key_part_link = get_business_key_part_link(dv_object.business_key_name.clone(), column_data); 141 | business_key_part_links.push(business_key_part_link); 142 | } 143 | 144 | let business_key_name = dv_object.business_key_name.to_lowercase().clone(); 145 | 146 | let business_key_id = Uuid::new_v4(); 147 | 148 | let business_key = BusinessKey { 149 | id: business_key_id, 150 | name: business_key_name, 151 | business_key_part_links, 152 | descriptors: Vec::new(), // Descriptors place on Link Key not Business Key 153 | }; 154 | 155 | business_keys.push(business_key); 156 | } 157 | } 158 | 159 | let link_key_name = business_keys 160 | .iter() 161 | .map(|bk| bk.name.as_str()) 162 | .collect::>() 163 | .join("_"); 164 | 165 | let link_key_id = Uuid::new_v4(); 166 | 167 | let link_key = LinkKey { 168 | id: link_key_id, 169 | name: link_key_name, 170 | business_keys, 171 | descriptors, 172 | }; 173 | 174 | link_keys.push(link_key); 175 | } 176 | 177 | // Shadowing to remove mutability. 178 | let link_keys = link_keys; 179 | 180 | // Build a Vector of BusinessKeys 181 | let mut business_keys: Vec = Vec::new(); 182 | for dv_objects_v in dv_objects_hm_single_bkp { 183 | 184 | let mut descriptors: Vec = Vec::new(); 185 | let mut business_key_part_links: Vec = Vec::new(); 186 | 187 | // Build Descriptors 188 | for dv_object in &dv_objects_v.1 { 189 | 190 | let column_data_id = Uuid::new_v4(); 191 | 192 | let column_data = ColumnData { 193 | id: column_data_id, 194 | system_id: dv_object.system_id, 195 | schema_name: dv_object.schema_name.clone(), 196 | table_oid: dv_object.table_oid, 197 | table_name: dv_object.table_name.clone(), 198 | column_name: dv_object.column_name.clone(), 199 | column_ordinal_position: dv_object.column_ordinal_position, 200 | column_type_name: dv_object.column_type_name.clone(), 201 | }; 202 | let orbit = dv_object.table_name.clone(); 203 | 204 | if dv_object.column_category == ColumnCategory::Descriptor { 205 | let descriptor = get_descriptor(dv_object.column_name.clone(), column_data, orbit, false); 206 | descriptors.push(descriptor); 207 | } else if dv_object.column_category == ColumnCategory::DescriptorSensitive { 208 | let descriptor = get_descriptor(dv_object.column_name.clone(), column_data, orbit, true); 209 | descriptors.push(descriptor); 210 | } 211 | } 212 | 213 | // Build Business Key Part Links 214 | for dv_object in &dv_objects_v.1 { 215 | 216 | let column_data_id = Uuid::new_v4(); 217 | 218 | let column_data = ColumnData { 219 | id: column_data_id, 220 | system_id: dv_object.system_id, 221 | schema_name: dv_object.schema_name.clone(), 222 | table_oid: dv_object.table_oid, 223 | table_name: dv_object.table_name.clone(), 224 | column_name: dv_object.column_name.clone(), 225 | column_ordinal_position: dv_object.column_ordinal_position, 226 | column_type_name: dv_object.column_type_name.clone(), 227 | }; 228 | 229 | if dv_object.column_category == ColumnCategory::BusinessKeyPart { 230 | // Alias good for 1 BKP, Refactor for many BKPs 231 | let business_key_part_link = get_business_key_part_link(dv_object.business_key_name.clone(), column_data); 232 | business_key_part_links.push(business_key_part_link); 233 | } 234 | } 235 | 236 | // TODO: Handle multiple business keys for link tables. Ensure appropriate error handling! 
237 |     let business_key_name: String = {
238 |         let mut business_key_name = String::new();
239 |         for dv_object in &dv_objects_v.1 {
240 |             if dv_object.business_key_name.to_lowercase() != "na" {
241 |                 business_key_name = dv_object.business_key_name.to_lowercase();
242 |             }
243 |         }
244 |         business_key_name
245 |     };
246 |
247 |     let business_key_id = Uuid::new_v4();
248 |     let business_key = BusinessKey {
249 |         id: business_key_id,
250 |         name: business_key_name,
251 |         business_key_part_links,
252 |         descriptors
253 |     };
254 |
255 |     business_keys.push(business_key);
256 | }
257 |
258 | // Shadowing to remove mutability.
259 | let business_keys = business_keys;
260 |
261 | let dw_schema = guc::get_guc(guc::PgAutoDWGuc::DwSchema).expect("DW SCHEMA GUC is not set.");
262 |
263 | // Build DV DDL from business keys and link keys
264 |
265 | let mut dv_ddl_sql = String::new();
266 |
267 | for business_key in &business_keys {
268 |     let dv_business_key_ddl_sql = build_sql_from_business_key(&dw_schema, business_key);
269 |     dv_ddl_sql.push_str(&dv_business_key_ddl_sql);
270 | }
271 |
272 | for link_key in &link_keys {
273 |     let dv_link_key_ddl_sql = build_sql_from_link_key(&dw_schema, link_key);
274 |     dv_ddl_sql.push_str(&dv_link_key_ddl_sql);
275 | }
276 |
277 | log!("Running DV DDL: {}", dv_ddl_sql);
278 |
279 | // Build Tables using DDL
280 | Spi::connect( |mut client| {
281 |     _ = client.update(&dv_ddl_sql, None, None);
282 |     log!("DV Tables Built");
283 | }
284 | );
285 |
286 | // Build DVTransformerSchema
287 |
288 | // Get the current time in GMT
289 | let now_gmt = Utc::now().naive_utc();
290 |
291 | let mut dv_schema = DVSchema {
292 |     id: build_id,
293 |     dw_schema,
294 |     create_timestamp_gmt: now_gmt,
295 |     modified_timestamp_gmt: now_gmt,
296 |     business_keys,
297 |     link_keys,
298 | };
299 |
300 | // Add Target Columns to dv_schema links.
301 |
302 | dv_schema_add_target_columns(&mut dv_schema);
303 |
304 | log!("DV Schema JSON: {:#?}", dv_schema);
305 |
306 | dv_schema_push_to_repo(&build_id.to_string(), &mut dv_schema);
307 |
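Aside: the push-to-repo followed by an immediate reload (below) effectively round-trips `DVSchema` through its JSON form. A minimal sketch of the same check as a plain unit test, assuming only `serde_json` and the `DVSchema` definition in `model/dv_schema.rs` (the helper name is illustrative):

```rust
// Sketch: verifies DVSchema survives serialize/deserialize unchanged
// in the fields that matter for rebuilds.
fn assert_schema_round_trip(schema: &DVSchema) {
    let json = serde_json::to_string(schema).expect("DVSchema should serialize");
    let reloaded: DVSchema = serde_json::from_str(&json).expect("DVSchema should deserialize");
    assert_eq!(schema.id, reloaded.id);
    assert_eq!(schema.business_keys.len(), reloaded.business_keys.len());
    assert_eq!(schema.link_keys.len(), reloaded.link_keys.len());
}
```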
308 | // TODO: Remove; this reload is redundant and exists for testing purposes. However, this function will be integral for future data refreshes.
309 | match dv_load_schema_from_build_id(&build_id.to_string()) {
310 |     Some(schema) => {
311 |         dv_schema = schema;
312 |     }
313 |     None => {
314 |         panic!("Repo Error")
315 |     }
316 | };
317 |
318 | insert_dw_source_columns(&dv_schema);
319 |
320 | if load_data {dv_data_loader(&dv_schema);}
321 |
322 | }
323 |
324 | fn insert_dw_source_columns(dv_schema: &DVSchema) {
325 |
326 |     let insert_dw_source_column: &str = r#"
327 |         INSERT INTO auto_dw.dw_source_objects (table_oid, column_ordinal_position)
328 |         VALUES ($1, $2)
329 |     "#;
330 |
331 |     for column in dv_schema.get_columns() {
332 |         let table_oid = column.0;
333 |         let column_ordinal_position = column.1;
334 |
335 |         log!("DV'd Table: {}, Col: {column_ordinal_position}", table_oid.as_u32());
336 |
337 |         Spi::connect( |mut client| {
338 |             _ = client.update(insert_dw_source_column, None,
339 |                 Some(vec![
340 |                     (PgOid::from(pg_sys::OIDOID), table_oid.into_datum()),
341 |                     (PgOid::from(pg_sys::INT2OID), column_ordinal_position.into_datum()),
342 |                 ]));
343 |         }
344 |         );
345 |     }
346 | }
347 |
348 | fn dv_schema_push_to_repo(build_id: &String, dv_schema: &mut DVSchema) {
349 |
350 |     let now_gmt = Utc::now().naive_utc();
351 |
352 |     dv_schema.modified_timestamp_gmt = now_gmt;
353 |
354 |     let insert_schema_query: &str = r#"
355 |         INSERT INTO auto_dw.dv_repo (build_id, schema)
356 |         VALUES ($1, $2)
357 |     "#;
358 |
359 |     let repo_json_string = serde_json::to_string(dv_schema).unwrap();
360 |
361 |     // Insert the schema JSON into the repo table
362 |     Spi::connect( |mut client| {
363 |         _ = client.update(insert_schema_query, None,
364 |             Some(vec![
365 |                 (PgOid::from(pg_sys::TEXTOID), build_id.into_datum()),
366 |                 (PgOid::from(pg_sys::JSONOID), repo_json_string.into_datum()),
367 |             ]));
368 |     }
369 |     );
370 |
371 | }
372 |
373 | fn dv_schema_add_target_columns(dv_schema: &mut DVSchema) {
374 |
375 |     for link_key in &mut dv_schema.link_keys {
376 |         for descriptor in &mut link_key.descriptors {
377 |             descriptor_add_target_columns(&dv_schema.dw_schema, descriptor);
378 |         }
379 |
380 |         for business_key in &mut link_key.business_keys {
381 |             for business_key_part_link in &mut business_key.business_key_part_links {
382 |                 business_key_part_link_add_hub_target_column(&dv_schema.dw_schema, &business_key.name, business_key_part_link);
383 |             }
384 |         }
385 |     }
386 |
387 |     for business_key in &mut dv_schema.business_keys {
388 |
389 |         // For Descriptors in Business Keys
390 |         for descriptor in &mut business_key.descriptors {
391 |             descriptor_add_target_columns(&dv_schema.dw_schema, descriptor);
392 |         }
393 |
394 |         // For Business Key Parts in Business Keys
395 |         for business_key_part_link in &mut business_key.business_key_part_links {
396 |             business_key_part_link_add_hub_target_column(&dv_schema.dw_schema, &business_key.name, business_key_part_link);
397 |         }
398 |
399 |     }
400 | }
401 |
402 | fn business_key_part_link_add_hub_target_column(schema_name: &String, business_key_name: &String, business_key_part_link: &mut BusinessKeyPartLink ) {
403 |
404 |     let table_name = &{"hub_".to_string() + business_key_name};
405 |     let column_name = &(business_key_part_link.alias.clone() + "_bk");
406 |
407 |     let get_column_data = queries::get_column_data(schema_name, table_name, column_name);
408 |
409 |     let column_data: Option<ColumnData> = Spi::connect( |client| {
410 |
411 |         match client.select(&get_column_data, None, None) {
412 |             Ok(column_data) => {
413 |                 // Only 0 or 1 record should be returned.
414 |                 if let Some(column_data_record) = column_data.into_iter().next() {
415 |                     let system_id = column_data_record.get_datum_by_ordinal(1).unwrap().value::<i64>().unwrap().unwrap();
416 |                     let _schema_oid = column_data_record.get_datum_by_ordinal(2).unwrap().value::<Oid>().unwrap().unwrap();
417 |                     let schema_name = column_data_record.get_datum_by_ordinal(3).unwrap().value::<String>().unwrap().unwrap();
418 |                     let table_name = column_data_record.get_datum_by_ordinal(4).unwrap().value::<String>().unwrap().unwrap();
419 |                     let table_oid = column_data_record.get_datum_by_ordinal(5).unwrap().value::<Oid>().unwrap().unwrap();
420 |                     let column_name = column_data_record.get_datum_by_ordinal(6).unwrap().value::<String>().unwrap().unwrap();
421 |                     let column_ordinal_position = column_data_record.get_datum_by_ordinal(7).unwrap().value::<i16>().unwrap().unwrap();
422 |                     let column_type_name = column_data_record.get_datum_by_ordinal(8).unwrap().value::<String>().unwrap().unwrap();
423 |
424 |                     let column_id = Uuid::new_v4();
425 |
426 |                     let column_data = ColumnData {
427 |                         id: column_id,
428 |                         system_id,
429 |                         schema_name,
430 |                         table_oid,
431 |                         table_name,
432 |                         column_name,
433 |                         column_ordinal_position,
434 |                         column_type_name,
435 |                     };
436 |
437 |                     return Some(column_data)
438 |
439 |                 } else {
440 |                     log!("Column Data Not available.");
441 |
442 |                 }
443 |                 return None
444 |             }
445 |             Err(e) => {
446 |                 log!("Target Column Data Error: {:?}", e);
447 |                 return None
448 |             }
449 |         }
450 |     });
451 |
452 |     business_key_part_link.hub_target_column = column_data;
453 |
454 | }
455 |
456 | fn descriptor_add_target_columns(schema_name: &String, descriptor: &mut Descriptor ) {
457 |
458 |     let table_name = &{"sat_".to_string() + &descriptor.orbit + {if descriptor.is_sensitive { "_sensitive" } else {""}}};
459 |     let column_name = &descriptor.descriptor_link.alias;
460 |
461 |     let get_column_data = queries::get_column_data(schema_name, table_name, column_name);
462 |
463 |     let column_data: Option<ColumnData> = Spi::connect( |client| {
464 |
465 |         match client.select(&get_column_data, None, None) {
466 |             Ok(column_data) => {
467 |                 // Only 0 or 1 record should be returned.
468 |                 if let Some(column_data_record) = column_data.into_iter().next() {
469 |                     let system_id = column_data_record.get_datum_by_ordinal(1).unwrap().value::<i64>().unwrap().unwrap();
470 |                     let _schema_oid = column_data_record.get_datum_by_ordinal(2).unwrap().value::<Oid>().unwrap().unwrap();
471 |                     let schema_name = column_data_record.get_datum_by_ordinal(3).unwrap().value::<String>().unwrap().unwrap();
472 |                     let table_name = column_data_record.get_datum_by_ordinal(4).unwrap().value::<String>().unwrap().unwrap();
473 |                     let table_oid = column_data_record.get_datum_by_ordinal(5).unwrap().value::<Oid>().unwrap().unwrap();
474 |                     let column_name = column_data_record.get_datum_by_ordinal(6).unwrap().value::<String>().unwrap().unwrap();
475 |                     let column_ordinal_position = column_data_record.get_datum_by_ordinal(7).unwrap().value::<i16>().unwrap().unwrap();
476 |                     let column_type_name = column_data_record.get_datum_by_ordinal(8).unwrap().value::<String>().unwrap().unwrap();
477 |
478 |                     let column_id = Uuid::new_v4();
479 |
480 |                     let column_data = ColumnData {
481 |                         id: column_id,
482 |                         system_id,
483 |                         schema_name,
484 |                         table_oid,
485 |                         table_name,
486 |                         column_name,
487 |                         column_ordinal_position,
488 |                         column_type_name,
489 |                     };
490 |
491 |                     return Some(column_data)
492 |
493 |                 } else {
494 |                     log!("Column Data Not available.");
495 |                 }
496 |                 return None
497 |             }
498 |             Err(e) => {
499 |                 log!("Target Column Data Error: {:?}", e);
500 |                 return None
501 |             }
502 |         }
503 |     });
504 |
505 |     descriptor.descriptor_link.target_column = column_data;
506 | }
507 |
508 | fn get_descriptor(column_name: String, column_data: ColumnData, orbit: String, is_sensitive: bool) -> Descriptor {
509 |     let descriptor_link_id = Uuid::new_v4();
510 |     let descriptor_link = DescriptorLink {
511 |         id: descriptor_link_id,
512 |         alias: column_name, // TODO: Give the user an option to change name in the future - modality TBD.
513 |         source_column: Some(column_data),
514 |         target_column: None,
515 |     };
516 |     let descriptor_id = Uuid::new_v4();
517 |     let descriptor = Descriptor {
518 |         id: descriptor_id,
519 |         descriptor_link,
520 |         orbit,
521 |         is_sensitive,
522 |     };
523 |
524 |     descriptor
525 | }
526 |
527 | fn get_business_key_part_link(alias: String, column_data: ColumnData) -> BusinessKeyPartLink {
528 |     let business_key_part_link_id = Uuid::new_v4();
529 |     let mut sources_column_data: Vec<ColumnData> = Vec::new();
530 |     sources_column_data.push(column_data);
531 |
532 |     let business_key_link = BusinessKeyPartLink {
533 |         id: business_key_part_link_id,
534 |         alias,
535 |         source_columns: sources_column_data,
536 |         hub_target_column: None,
537 |     };
538 |
539 |     business_key_link
540 | }
541 |
542 | fn build_sql_from_link_key(dw_schema: &String, link_key: &LinkKey) -> String {
543 |
544 |     let mut dv_link_key_ddl_sql = String::new();
545 |
546 |     let link_key_name = &link_key.name;
547 |
548 |     let mut bk_name_types: Vec<String> = Vec::new();
549 |
550 |     for bk in &link_key.business_keys {
551 |         let bk_name_type = format!("hub_{}_hk VARCHAR", bk.name);
552 |         bk_name_types.push(bk_name_type);
553 |     }
554 |
555 |     let bk_name_types_subsql = bk_name_types.join(",\n");
556 |
557 |     dv_link_key_ddl_sql +=
558 |         &format!(r#"
559 |         CREATE TABLE IF NOT EXISTS {dw_schema}.link_{link_key_name} (
560 |             link_{link_key_name}_hk VARCHAR NOT NULL,
561 |             load_ts TIMESTAMP WITHOUT TIME ZONE NOT NULL,
562 |             record_source VARCHAR NOT NULL,
563 |             {bk_name_types_subsql}
564 |         );
565 |         "#);
566 |
567 |     // Sat Buildout
568 |     let mut satellite_sqls: HashMap<String, String> = HashMap::new();
569 |
570 |     for descriptor in &link_key.descriptors {
571 |
572 |         let sensitive_string = {
573 |             if descriptor.is_sensitive {
574 |                 "_sensitive".to_string()
575 |             } else {
576 |                 "".to_string()
577 |             }
578 |         };
579 |
580 |         let satellite_sql_key = descriptor.orbit.clone() + &sensitive_string;
581 |         let desc_column_name = &descriptor.descriptor_link.alias;
582 |         let desc_column_type = &descriptor.descriptor_link.source_column.as_ref().unwrap().column_type_name;
583 |         let sat_descriptor_sql_part: String = format!(",\n    {} {}", desc_column_name, desc_column_type);
584 |
585 |         if let Some(existing_sat_sql) = satellite_sqls.get_mut(&satellite_sql_key) {
586 |             if let Some(pos) = existing_sat_sql.find(");") {
587 |                 existing_sat_sql.insert_str(pos, &sat_descriptor_sql_part);
588 |             } else {
589 |                 log!("The substring \");\" was not found in the original string.");
590 |             }
591 |         } else {
592 |             let begin_sat_sql =
593 |                 format!(r#"
594 |                 CREATE TABLE {}.sat_{} (
595 |                     link_{}_hk VARCHAR NOT NULL,
596 |                     load_ts TIMESTAMP WITHOUT TIME ZONE NOT NULL,
597 |                     record_source VARCHAR NOT NULL,
598 |                     sat_{}_hd VARCHAR NOT NULL{});
599 |                 "#, dw_schema, satellite_sql_key, link_key.name, satellite_sql_key, sat_descriptor_sql_part);
600 |             satellite_sqls.insert(satellite_sql_key, begin_sat_sql);
601 |         }
602 |     }
603 |
604 |     for satellite_sql in satellite_sqls {
605 |         dv_link_key_ddl_sql.push_str(&satellite_sql.1);
606 |     }
607 |
608 |     for business_key in &link_key.business_keys {
609 |         dv_link_key_ddl_sql.push_str(&build_sql_from_business_key(dw_schema, business_key));
610 |     }
611 |
612 |     dv_link_key_ddl_sql
613 | }
614 |
615 | fn build_sql_from_business_key(dw_schema: &String, business_key: &BusinessKey) -> String {
616 |     let mut dv_business_key_ddl_sql = String::new();
617 |
618 |     // Hub Buildout
619 |     let mut hub_bks = String::new();
620 |
621 |     for part_link in &business_key.business_key_part_links {
622 |
let r = format!(r#", 623 | {}_bk VARCHAR"#, part_link.alias); 624 | hub_bks.push_str(&r); 625 | } 626 | 627 | let hub_sql = 628 | format!(r#" 629 | CREATE TABLE IF NOT EXISTS {}.hub_{} ( 630 | hub_{}_hk VARCHAR NOT NULL, 631 | load_ts TIMESTAMP WITHOUT TIME ZONE NOT NULL, 632 | record_source VARCHAR NOT NULL{} 633 | ); 634 | "#, dw_schema, business_key.name, business_key.name, hub_bks); 635 | 636 | dv_business_key_ddl_sql.push_str(&format!( 637 | r#" 638 | {}"#, hub_sql)); 639 | 640 | // Sat Buildout 641 | let mut satellite_sqls: HashMap = HashMap::new(); 642 | 643 | for descriptor in &business_key.descriptors { 644 | 645 | let sensitive_string = { 646 | if descriptor.is_sensitive == true { 647 | "_sensitive".to_string() 648 | } else { 649 | "".to_string() 650 | } 651 | }; 652 | 653 | let satellite_sql_key = descriptor.orbit.clone() + &sensitive_string; 654 | let desc_column_name = &descriptor.descriptor_link.alias; 655 | let desc_column_type = &descriptor.descriptor_link.source_column.as_ref().unwrap().column_type_name; 656 | let sat_descriptor_sql_part: String = format!(",\n {} {}", desc_column_name, desc_column_type); 657 | 658 | if let Some(existing_sat_sql) = satellite_sqls.get_mut(&satellite_sql_key) { 659 | if let Some(pos) = existing_sat_sql.find(");") { 660 | existing_sat_sql.insert_str(pos, &sat_descriptor_sql_part); 661 | } else { 662 | println!("The substring \");\" was not found in the original string."); 663 | } 664 | } else { 665 | let begin_sat_sql = 666 | format!(r#" 667 | CREATE TABLE {}.sat_{} ( 668 | hub_{}_hk VARCHAR NOT NULL, 669 | load_ts TIMESTAMP WITHOUT TIME ZONE NOT NULL, 670 | record_source VARCHAR NOT NULL, 671 | sat_{}_hd VARCHAR NOT NULL{}); 672 | "#, dw_schema, satellite_sql_key, business_key.name, satellite_sql_key, sat_descriptor_sql_part); 673 | satellite_sqls.insert(satellite_sql_key, begin_sat_sql); 674 | } 675 | } 676 | 677 | for satellite_sql in satellite_sqls { 678 | dv_business_key_ddl_sql.push_str(&satellite_sql.1); 679 | } 680 | 681 | dv_business_key_ddl_sql 682 | } 683 | 684 | #[derive(Debug, PartialEq)] 685 | enum ColumnCategory { 686 | BusinessKeyPart, 687 | Descriptor, 688 | DescriptorSensitive, 689 | } 690 | 691 | impl ColumnCategory { 692 | fn from_str(input: &str) -> ColumnCategory { 693 | match input { 694 | "Business Key Part" => ColumnCategory::BusinessKeyPart, 695 | "Descriptor" => ColumnCategory::Descriptor, 696 | "Descriptor - Sensitive" => ColumnCategory::DescriptorSensitive, 697 | _ => panic!("'{}' is not a valid ColumnCategory", input), 698 | } 699 | } 700 | } 701 | 702 | #[derive(Debug)] 703 | struct TransformerObject { 704 | #[allow(dead_code)] 705 | schema_name: String, 706 | table_name: String, 707 | business_key_name: String, 708 | column_name: String, 709 | column_type_name: String, 710 | system_id: i64, 711 | table_oid: Oid, 712 | column_ordinal_position: i16, 713 | column_category: ColumnCategory, 714 | } 715 | 716 | // Separates TransformerObject with multiple business key parts 717 | fn separate_by_business_parts(dv_objects_hm_single_bkp: HashMap>) -> (HashMap>, HashMap>) { 718 | 719 | let mut single_business_key_part: HashMap> = HashMap::new(); 720 | let mut multiple_business_key_parts: HashMap> = HashMap::new(); 721 | 722 | for (table_oid, transformer_objects) in dv_objects_hm_single_bkp { 723 | let business_key_count = transformer_objects.iter() 724 | .filter(|obj| matches!(obj.column_category, ColumnCategory::BusinessKeyPart)) 725 | .count(); 726 | 727 | if business_key_count > 1 { 728 | 
-------------------------------------------------------------------------------- /extension/src/controller/dv_loader.rs: --------------------------------------------------------------------------------
1 | use pgrx::prelude::*;
2 | use std::collections::HashMap;
3 | use crate::model::dv_schema::*;
4 |
5 |
6 | pub fn get_dv_schemas() -> Vec<DVSchema> {
7 |
8 |     // get DV_SCHEMAS via Query
9 |     let get_schemas_query: &str = r#"
10 |         SELECT schema
11 |         FROM auto_dw.dv_repo;
12 |     "#;
13 |
14 |     // Load schemas
15 |     let mut dv_schemas: Vec<DVSchema> = Vec::new();
16 |     Spi::connect( |client| {
17 |
18 |         let schema_results = client.select(get_schemas_query, None, None);
19 |         match schema_results {
20 |             Ok(schema_results) => {
21 |                 for schema_result in schema_results {
22 |                     let schema_json = schema_result.get_datum_by_ordinal(1).unwrap().value::<pgrx::Json>().unwrap().unwrap();
23 |                     let pgrx::Json(schema_json_value) = schema_json;
24 |                     let dv_schema: Result<DVSchema, serde_json::Error> = serde_json::from_value(schema_json_value);
25 |
26 |                     match dv_schema {
27 |                         Ok(dv_schema) => dv_schemas.push(dv_schema),
28 |                         Err(e) => panic!("Failure to unwrap dv_schema, error: {e}"),
29 |                     }
30 |                 }
31 |             },
32 |             Err(e) => panic!("Get Schemas Query Failure, error: {e}"),
33 |         }
34 |     });
35 |
36 |     dv_schemas
37 | }
38 |
39 | // Load All DV Schemas
40 |
41 | pub fn dv_load_schemas_all() -> bool {
42 |
43 |     for dv_schema in get_dv_schemas() {
44 |         dv_data_loader(&dv_schema);
45 |         log!("DV Schema (Build ID) Loaded: {}", dv_schema.id.to_string())
46 |     }
47 |     true
48 | }
49 |
50 | pub fn dv_load_schema_from_build_id(build_id: &String) -> Option<DVSchema> {
51 |     let get_schema_query: &str = r#"
52 |         SELECT schema
53 |         FROM auto_dw.dv_repo
54 |         WHERE build_id = $1
55 |     "#;
56 |
57 |     // Variable to store the result
58 |     let mut schema_result: Option<DVSchema> = None;
59 |
60 |     // Load Schema w/ Build ID
61 |     Spi::connect( |client| {
62 |         let results = client.select(get_schema_query, None,
63 |             Some(vec![
64 |                 (PgOid::from(pg_sys::TEXTOID), build_id.into_datum()),
65 |             ]));
66 |
67 |         match results {
68 |             Ok(results) => {
69 |                 if let Some(result) = results.into_iter().next() {
70 |                     let schema_json = result.get_datum_by_ordinal(1).unwrap().value::<pgrx::Json>().unwrap().unwrap();
71 |                     let deserialized_schema: Result<DVSchema, serde_json::Error> = serde_json::from_value(schema_json.0);
72 |                     match deserialized_schema {
73 |                         Ok(deserialized_schema) => {
74 |                             schema_result = Some(deserialized_schema);
75 |                         },
76 |                         Err(_) => {
77 |                             log!("Schema could not be deserialized");
78 |                         },
79 |                     }
80 |                 }
81 |             },
82 |             Err(_) => {
83 |                 log!("Get Schema Query Failure.");
84 |             },
85 |         }
86 |
87 |     });
88 |     return schema_result;
89 | }
90 |
91 | // Refreshes based on dv_schema
92 | pub fn dv_data_loader(dv_schema: &DVSchema) {
93 |
94 |     // Create DML: Link Load SQL for Link Objects in DV Schema
95 |     let link_dmls = create_dv_link_dml_for_lks(dv_schema);
96 |
97 |     // Create DML: Hub Load SQL for BusinessKey and LinkKey Objects in DV Schema
98 |     let hub_dmls = create_dv_hub_dml_for_bks(dv_schema) + &create_dv_hub_dml_for_lks(dv_schema);
99 |
100 |     // Create DML: Satellite Load SQL for BK Objects in DV Schema
101 |     let sat_dmls = create_dv_sat_dml_for_bks_descriptors(dv_schema) + &create_dv_sat_dml_for_lks_descriptors(dv_schema);
102 |
103 |     // Run SQL
104 |     let dv_dmls = link_dmls + &hub_dmls + &sat_dmls;
105 |
106 |     log!("DML: {dv_dmls}");
107 |
108 |     // Load tables using DML
109 |     Spi::connect( |mut client| {
110 |
111 |         _ = client.update(&dv_dmls, None, None);
112 |         log!("Data Pushed to DV tables.");
113 |     }
114 |     );
115 |
116 | }
117 |
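Because every DML builder below filters through an anti-join against the target table, `dv_data_loader` is idempotent: re-running it should insert nothing new. A hedged test sketch of that property (the `my_dw.hub_customer` table name is hypothetical and depends on the schema actually built):

```rust
// Sketch only: assumes a build has already run against sample data.
#[pg_test]
fn test_dv_data_loader_is_idempotent() {
    for dv_schema in get_dv_schemas() {
        dv_data_loader(&dv_schema);
    }
    let first = Spi::get_one::<i64>("SELECT COUNT(*) FROM my_dw.hub_customer").unwrap();
    for dv_schema in get_dv_schemas() {
        dv_data_loader(&dv_schema); // second pass should be a no-op
    }
    let second = Spi::get_one::<i64>("SELECT COUNT(*) FROM my_dw.hub_customer").unwrap();
    assert_eq!(first, second);
}
```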
118 | fn create_dv_link_dml_for_lks(dv_schema: &DVSchema) -> String {
119 |     let mut link_insert_dmls = String::new();
120 |     let dw_schema = &dv_schema.dw_schema;
121 |
122 |     for link_key in &dv_schema.link_keys {
123 |         let mut link_bk_source_parts_name: Vec<String> = Vec::new();
124 |
125 |         let mut hub_name_hks: Vec<String> = Vec::new();
126 |         let mut hub_hash_sqls: Vec<String> = Vec::new();
127 |         for business_key in &link_key.business_keys {
128 |             // Array Parts
129 |             let bk_parts: Vec<String> =
130 |                 business_key.business_key_part_links
131 |                 .iter()
132 |                 .map(|part_link| part_link.source_columns[0].column_name.clone())
133 |                 .collect();
134 |
135 |             let bk_source_parts_joined = bk_parts.join("::TEXT,") + "::TEXT";
136 |             let business_key_name = &business_key.name;
137 |
138 |             hub_hash_sqls.push(format!("auto_dw.hash(ARRAY_TO_STRING(ARRAY[{bk_source_parts_joined}], ',')) AS hub_{business_key_name}_hk"));
139 |             link_bk_source_parts_name.push(bk_source_parts_joined);
140 |
141 |             // For Insert
142 |             hub_name_hks.push(format!("hub_{business_key_name}_hk"));
143 |         }
144 |
145 |         let link_key_name = &link_key.name;
146 |         let link_hk_parts = &(link_bk_source_parts_name.join("::TEXT, ") + "::TEXT");
147 |         let link_hk_sql =
148 |             format!("auto_dw.hash(ARRAY_TO_STRING(ARRAY[{link_hk_parts}], ',')) AS link_{link_key_name}_hk,");
149 |         let timestamp_sql = format!("(CURRENT_TIMESTAMP AT TIME ZONE 'UTC')::TIMESTAMP(6) AS load_ts,");
150 |         let record_source =
151 |             link_key.business_keys[0].business_key_part_links[0].source_columns[0].system_id.to_string() + ":" +
152 |             &link_key.business_keys[0].business_key_part_links[0].source_columns[0].schema_name;
153 |         let record_source_sql = format!("'{record_source}' AS record_source,");
154 |         let hubs_hash_sql = &hub_hash_sqls.join(", \n");
155 |         let schema_name = &link_key.business_keys[0].business_key_part_links[0].source_columns[0].schema_name;
156 |         let source_table_name = &link_key.business_keys[0].business_key_part_links[0].source_columns[0].table_name;
157 |         let source_schema_table_sql = format!("{schema_name}.{source_table_name}");
158 |
159 |         let hub_name_hks_sql = hub_name_hks.join(",\n");
160 |
161 |
162 |         let insert_sql = format!(
163 |             r#"
164 |             INSERT INTO {dw_schema}.link_{link_key_name} (
165 |                 link_{link_key_name}_hk,
166 |                 load_ts,
167 |                 record_source,
168 |                 {hub_name_hks_sql}
169 |             )"#
170 |         );
171 |
172 |         let source_cte_sql =
173 |             format!(
174 |                 r#"
175 |                 WITH
176 |                 stg_data AS (
177 |                     SELECT
178 |                         {link_hk_sql}
179 |                         {timestamp_sql}
180 |                         {record_source_sql}
181 |                         {hubs_hash_sql}
182 |                     FROM {source_schema_table_sql}
183 |                 ),
184 |                 "#
185 |             );
186 |
187 |         let new_source_cte_sql =
188 |             format!(
189 |                 r#"
190 |                 new_stg_data AS (
191 |                     SELECT stg_data.* FROM stg_data
192 |                     LEFT JOIN {dw_schema}.link_{link_key_name} ON stg_data.link_{link_key_name}_hk = link_{link_key_name}.link_{link_key_name}_hk
193 |                     WHERE link_{link_key_name}.link_{link_key_name}_hk IS NULL
194 |                 )
195 |                 "#
196 |             );
197 |
198 |         let select_sql = format!(
199 |             r#"
200 |             SELECT
201 |                 link_{link_key_name}_hk,
202 |                 load_ts,
203 |                 record_source,
204 |                 {hub_name_hks_sql}
205 |             FROM new_stg_data;
206 |             "#
207 |         );
208 |
209 |         let sql_for_link_insert = insert_sql + &source_cte_sql + &new_source_cte_sql + &select_sql;
210 |         link_insert_dmls.push_str(&sql_for_link_insert);
211 |     }
212 |
213 |     link_insert_dmls
214 | }
215 |
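For orientation, here is roughly what `create_dv_link_dml_for_lks` emits for a hypothetical link over `hub_order` and `hub_product` sourced from `public.order_details` (illustrative rendering only; the destination schema `my_dw`, the `1:public` record source, and all names are assumptions, and actual output depends on the `DVSchema` contents):

```rust
// Hypothetical rendered output, shown as the kind of string the function builds.
const EXAMPLE_LINK_DML: &str = r#"
INSERT INTO my_dw.link_order_product (
    link_order_product_hk,
    load_ts,
    record_source,
    hub_order_hk,
    hub_product_hk
)
WITH
stg_data AS (
    SELECT
        auto_dw.hash(ARRAY_TO_STRING(ARRAY[order_id::TEXT, product_id::TEXT], ',')) AS link_order_product_hk,
        (CURRENT_TIMESTAMP AT TIME ZONE 'UTC')::TIMESTAMP(6) AS load_ts,
        '1:public' AS record_source,
        auto_dw.hash(ARRAY_TO_STRING(ARRAY[order_id::TEXT], ',')) AS hub_order_hk,
        auto_dw.hash(ARRAY_TO_STRING(ARRAY[product_id::TEXT], ',')) AS hub_product_hk
    FROM public.order_details
),
new_stg_data AS (
    SELECT stg_data.* FROM stg_data
    LEFT JOIN my_dw.link_order_product ON stg_data.link_order_product_hk = link_order_product.link_order_product_hk
    WHERE link_order_product.link_order_product_hk IS NULL
)
SELECT
    link_order_product_hk,
    load_ts,
    record_source,
    hub_order_hk,
    hub_product_hk
FROM new_stg_data;
"#;
```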
216 | fn create_dv_hub_dml_for_lks(dv_schema: &DVSchema) -> String {
217 |
218 |     let mut link_insert_dmls = String::new();
219 |
220 |     for link_key in &dv_schema.link_keys {
221 |
222 |         for business_key in &link_key.business_keys {
223 |             let hub_dml = business_key_to_hub_dml(business_key, &dv_schema.dw_schema);
224 |             link_insert_dmls.push_str(&hub_dml);
225 |         }
226 |
227 |     }
228 |
229 |     link_insert_dmls
230 | }
231 |
232 | fn create_dv_hub_dml_for_bks(dv_schema: &DVSchema) -> String {
233 |
234 |     let mut hub_insert_dmls = String::new();
235 |
236 |     for business_key in &dv_schema.business_keys {
237 |         let hub_dml = business_key_to_hub_dml(business_key, &dv_schema.dw_schema);
238 |         hub_insert_dmls.push_str(&hub_dml);
239 |     }
240 |
241 |     hub_insert_dmls
242 | }
243 |
244 | fn create_dv_sat_dml_for_lks_descriptors(dv_schema: &DVSchema) -> String {
245 |     let mut sat_link_insert_dmls = String::new();
246 |     let dw_schema = &dv_schema.dw_schema;
247 |
248 |     for link_key in &dv_schema.link_keys {
249 |         let mut link_bk_source_parts_name: Vec<String> = Vec::new();
250 |         let mut hub_hash_sqls: Vec<String> = Vec::new();
251 |         for business_key in &link_key.business_keys {
252 |             // Array Parts
253 |             let bk_parts: Vec<String> =
254 |                 business_key.business_key_part_links
255 |                 .iter()
256 |                 .map(|part_link| part_link.source_columns[0].column_name.clone())
257 |                 .collect();
258 |
259 |             let bk_source_parts_joined = bk_parts.join("::TEXT,") + "::TEXT";
260 |             let business_key_name = &business_key.name;
261 |
262 |             hub_hash_sqls.push(format!("auto_dw.hash(ARRAY_TO_STRING(ARRAY[{bk_source_parts_joined}], ',')) AS hub_{business_key_name}_hk"));
263 |             link_bk_source_parts_name.push(bk_source_parts_joined);
264 |         }
265 |
266 |         let link_key_name = &link_key.name;
267 |         let link_hk_parts = &(link_bk_source_parts_name.join("::TEXT, ") + "::TEXT");
268 |         let link_hk_sql =
269 |             format!("auto_dw.hash(ARRAY_TO_STRING(ARRAY[{link_hk_parts}], ',')) AS link_{link_key_name}_hk,");
270 |         let timestamp_sql = format!("(CURRENT_TIMESTAMP AT TIME ZONE 'UTC')::TIMESTAMP(6) AS load_ts,");
271 |         let record_source =
272 |             link_key.business_keys[0].business_key_part_links[0].source_columns[0].system_id.to_string() + ":" +
273 |             &link_key.business_keys[0].business_key_part_links[0].source_columns[0].schema_name;
274 |         let record_source_sql = format!("'{record_source}' AS record_source,");
275 |         let schema_name = &link_key.business_keys[0].business_key_part_links[0].source_columns[0].schema_name;
276 |         let source_table_name = &link_key.business_keys[0].business_key_part_links[0].source_columns[0].table_name;
277 |         let source_schema_table_sql = format!("{schema_name}.{source_table_name}");
278 |
279 |         // Closure to load Sensitive and Non-Sensitive Satellites
280 |         let mut insert_link_sat_type = |type_string: &str, descriptors: Vec<&Descriptor>| {
281 |
282 |             let descriptor_names: Vec<&str> =
283 |                 descriptors
284 |                 .iter().map(|descriptor| {
285 |                     descriptor
286 |                         .descriptor_link
287 |                         .source_column.as_ref()
288 |                         .expect("Expected source_column to be Some, but found None")
289 |                         .column_name
290 |                         .as_str()
291 |                 }).collect();
292 |
293 |             let descriptor_names_comma_separated = descriptor_names.join(", ");
294 |
295 |             let insert_sql =
296 |                 format!(
297 |                     r#"
298 |                     INSERT INTO {dw_schema}.sat_{source_table_name}{type_string} (
299 |                         link_{link_key_name}_hk,
300 |                         load_ts,
301 |                         record_source,
302 |                         sat_{source_table_name}{type_string}_hd,
303 |                         {descriptor_names_comma_separated}
304 |                     )"#
305 |                 );
306 |
307 |             let descriptor_names_text_comma_separated = descriptor_names.join("::TEXT, ") + "::TEXT";
308 |             let descriptors_hd_sql =
309 |                 format!("auto_dw.hash(ARRAY_TO_STRING(ARRAY[{descriptor_names_text_comma_separated}], ',')) AS sat_{source_table_name}{type_string}_hd,");
310 |
311 |             let source_cte_sql =
312 |                 format!(
313 |                     r#"
314 |                     WITH
315 |                     stg_data AS (
316 |                         SELECT
317 |                             {link_hk_sql}
318 |                             {timestamp_sql}
319 |                             {record_source_sql}
320 |                             {descriptors_hd_sql}
321 |                             {descriptor_names_comma_separated}
322 |                         FROM {source_schema_table_sql}
323 |                     ),
324 |                     "#
325 |                 );
326 |
327 |             let new_source_cte_sql =
328 |                 format!(
329 |                     r#"
330 |                     new_stg_data AS (
331 |                         SELECT stg_data.* FROM stg_data
332 |                         LEFT JOIN {dw_schema}.sat_{source_table_name}{type_string} ON stg_data.link_{link_key_name}_hk = sat_{source_table_name}{type_string}.link_{link_key_name}_hk
333 |                             AND stg_data.sat_{source_table_name}{type_string}_hd = sat_{source_table_name}{type_string}.sat_{source_table_name}{type_string}_hd
334 |                         WHERE sat_{source_table_name}{type_string}.link_{link_key_name}_hk IS NULL
335 |                     )
336 |                     "#
337 |                 );
338 |
339 |             let select_sql =
340 |                 format!(
341 |                     r#"
342 |                     SELECT
343 |                         link_{link_key_name}_hk,
344 |                         load_ts,
345 |                         record_source,
346 |                         sat_{source_table_name}{type_string}_hd,
347 |                         {descriptor_names_comma_separated}
348 |                     FROM new_stg_data;
349 |                     "#
350 |                 );
351 |
352 |             let sql_for_link_insert = insert_sql + &source_cte_sql + &new_source_cte_sql + &select_sql;
353 |             sat_link_insert_dmls.push_str(&sql_for_link_insert);
354 |         };
355 |
356 |         let (descriptors_sensitive, descriptors_not_sensitive): (Vec<&Descriptor>, Vec<&Descriptor>) =
357 |             link_key.descriptors
358 |             .iter()
359 |             .partition(|descriptor| descriptor.is_sensitive);
360 |
361 |         // INSERT for Sensitive Descriptors
362 |         let has_sensitive_descriptors = !descriptors_sensitive.is_empty();
363 |         if has_sensitive_descriptors {
364 |             insert_link_sat_type("_sensitive", descriptors_sensitive);
365 |         }
366 |
367 |         // INSERT for "Standard" (Non-Sensitive) Descriptors
368 |         let has_descriptors = !descriptors_not_sensitive.is_empty();
369 |         if has_descriptors {
370 |             insert_link_sat_type("", descriptors_not_sensitive);
371 |         }
372 |
373 |     }
374 |
375 |     sat_link_insert_dmls
376 | }
377 |
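Aside: both satellite builders follow the same naming convention, splitting each table's descriptors across a standard satellite and a `_sensitive` satellite. A minimal sketch of that mapping, using the `Descriptor` type from `model/dv_schema.rs` (the helper itself is illustrative and not part of the current code):

```rust
// Sketch: how a descriptor's orbit and sensitivity flag determine its satellite.
fn satellite_name_for(descriptor: &Descriptor) -> String {
    let suffix = if descriptor.is_sensitive { "_sensitive" } else { "" };
    format!("sat_{}{}", descriptor.orbit, suffix)
}
```

Keeping sensitive descriptors in their own satellite lets access controls be applied per table rather than per column.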
378 | fn create_dv_sat_dml_for_bks_descriptors(dv_schema: &DVSchema) -> String {
379 |
380 |     let mut sat_insert_dmls = String::new();
381 |     let dw_schema = dv_schema.dw_schema.clone();
382 |
383 |     for business_key in &dv_schema.business_keys {
384 |
385 |         // Array Parts
386 |         let mut hub_bk_parts_sql_stg_array = String::new();
387 |         for part_link in &business_key.business_key_part_links {
388 |             // TODO: Need to account for more than one source. However, the Vec data structure isn't ideal; refactor.
389 |             let e = format!(r#"stg.{}::TEXT,"#, part_link.source_columns[0].column_name);
390 |             hub_bk_parts_sql_stg_array.push_str(&e);
391 |         }
392 |         hub_bk_parts_sql_stg_array.pop(); // Removing the last ","
393 |
394 |         // Sat Buildout
395 |         let mut sat_insert_sql_header_parts: HashMap<String, String> = HashMap::new();
396 |         let mut descriptors_for_sats: HashMap<String, Vec<&Descriptor>> = HashMap::new();
397 |
398 |         for descriptor in &business_key.descriptors {
399 |
400 |             let sensitive_string = {
401 |                 if descriptor.is_sensitive {
402 |                     "_sensitive".to_string()
403 |                 } else {
404 |                     "".to_string()
405 |                 }
406 |             };
407 |
408 |             let satellite_sql_key = descriptor.orbit.clone() + &sensitive_string;
409 |
410 |             descriptors_for_sats
411 |                 .entry(satellite_sql_key.clone())
412 |                 .or_insert_with(Vec::new)
413 |                 .push(descriptor);
414 |
415 |             let desc_column_name = &descriptor.descriptor_link.alias;
416 |
417 |             // SAT INSERT Header
418 |             let sat_descriptor_sql_part: String = format!(",\n    {}", desc_column_name);
419 |             if let Some(existing_sat_sql) = sat_insert_sql_header_parts.get_mut(&satellite_sql_key) {
420 |                 if let Some(pos) = existing_sat_sql.find(")") {
421 |                     existing_sat_sql.insert_str(pos, &sat_descriptor_sql_part);
422 |                 } else {
423 |                     log!("The substring \")\" was not found in the original string.");
424 |                 }
425 |             } else {
426 |                 let begin_sat_sql =
427 |                     format!(r#"
428 |                     INSERT INTO {}.sat_{} (
429 |                         hub_{}_hk,
430 |                         load_ts,
431 |                         record_source,
432 |                         sat_{}_hd{})
433 |                     "#,
434 |                     dw_schema, &satellite_sql_key,
435 |                     business_key.name,
436 |                     &satellite_sql_key, sat_descriptor_sql_part);
437 |
438 |                 sat_insert_sql_header_parts.insert(satellite_sql_key.clone(), begin_sat_sql);
439 |             }
440 |         }
441 |
442 |         // Array SQL
443 |         let mut sats_source_sql_array: HashMap<String, String> = HashMap::new();
444 |         for (key, descriptors) in descriptors_for_sats.clone() {
445 |             let array_part_str = sats_source_sql_array.entry(key.clone()).or_insert_with(String::new);
446 |
447 |             for descriptor in descriptors {
448 |                 if let Some(column) = descriptor.descriptor_link.source_column.as_ref() {
449 |                     let array_part = if array_part_str.is_empty() {
450 |                         format!("stg.{}::TEXT", column.column_name)
451 |                     } else {
452 |                         format!(", stg.{}::TEXT", column.column_name)
453 |                     };
454 |                     array_part_str.push_str(&array_part);
455 |                 }
456 |             }
457 |         }
458 |
459 |         // Column SQL
460 |         let mut sats_source_sql_cols: HashMap<String, String> = HashMap::new();
461 |         for (key, descriptors) in descriptors_for_sats.clone() {
462 |             let col_part_str = sats_source_sql_cols.entry(key.clone()).or_insert_with(String::new);
463 |
464 |             for descriptor in descriptors {
465 |                 if let Some(column) = descriptor.descriptor_link.source_column.as_ref() {
466 |                     let col_part = format!(r#",
467 |                     {}"#,
468 |                     column.column_name);
469 |                     col_part_str.push_str(&col_part);
470 |                 }
471 |             }
472 |         }
473 |
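Aside: the `sat_{key}_hd` column these builders compute is a hash diff. A row is re-inserted only when the hash of its descriptor payload changes, which is what makes the loads below incremental. A minimal sketch of the same idea in Rust, using the identical Sha256-hex scheme as the `auto_dw.hash` SQL function defined in `lib.rs` (the helper names are illustrative):

```rust
use sha2::{Digest, Sha256};

// Sketch: hash of the comma-joined descriptor values, matching auto_dw.hash.
fn hash_diff(descriptor_values: &[&str]) -> String {
    hex::encode(Sha256::digest(descriptor_values.join(",").as_bytes()))
}

// A satellite row has changed when its recomputed hash diff differs.
fn row_changed(old_hd: &str, descriptor_values: &[&str]) -> bool {
    hash_diff(descriptor_values) != old_hd
}
```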
474 |         // Main Insert
475 |
476 |         for (key, insert_header) in sat_insert_sql_header_parts {
477 |
478 |             let sat_source_sql_array = sats_source_sql_array.get(&key).map(|v| v.as_str()).unwrap_or("NA");
479 |             let sat_source_sql_cols = sats_source_sql_cols.get(&key).map(|v| v.as_str()).unwrap_or("NA");
480 |
481 |             // TODO: Change data structure to support multiple source schemas.
482 |             let source_schema_name = descriptors_for_sats
483 |                 .get(&key)
484 |                 .and_then(|v| v.get(0)) // Safely get the first element
485 |                 .and_then(|descriptor| descriptor.descriptor_link.source_column.as_ref()) // Safely access source_column
486 |                 .map(|source_column| source_column.schema_name.clone()) // Safely get schema_name and clone it
487 |                 .unwrap_or_default(); // Provide a default value in case of None
488 |
489 |             let source_table_name = descriptors_for_sats
490 |                 .get(&key)
491 |                 .and_then(|v| v.get(0)) // Safely get the first element
492 |                 .and_then(|descriptor| descriptor.descriptor_link.source_column.as_ref()) // Safely access source_column
493 |                 .map(|source_column| source_column.table_name.clone()) // Safely get table_name and clone it
494 |                 .unwrap_or_default(); // Provide a default value in case of None
495 |
496 |             let business_key_name = &business_key.name;
497 |
498 |             let insert_sql = format!(r#"
499 |             -- SAT INSERT SQL
500 |             {insert_header}
501 |             WITH stg AS (
502 |                 SELECT
503 |                     *,
504 |                     auto_dw.hash(
505 |                         ARRAY_TO_STRING(ARRAY[{hub_bk_parts_sql_stg_array}], ',')
506 |                     ) AS hub_{business_key_name}_hk,
507 |                     auto_dw.hash(
508 |                         ARRAY_TO_STRING(ARRAY[{sat_source_sql_array}], ',')
509 |                     ) AS sat_{key}_hd
510 |                 FROM {source_schema_name}.{source_table_name} AS stg
511 |             ),
512 |             new_stg_data AS (
513 |                 SELECT stg.*
514 |                 FROM stg
515 |                 LEFT JOIN {dw_schema}.sat_{key} ON
516 |                     stg.hub_{business_key_name}_hk = sat_{key}.hub_{business_key_name}_hk AND
517 |                     stg.sat_{key}_hd = sat_{key}.sat_{key}_hd
518 |                 WHERE sat_{key}.hub_{business_key_name}_hk IS NULL
519 |             )
520 |             SELECT
521 |                 hub_{business_key_name}_hk,
522 |                 (CURRENT_TIMESTAMP AT TIME ZONE 'UTC')::TIMESTAMP WITHOUT TIME ZONE AS load_ts,
523 |                 '{source_schema_name}' AS record_source,
524 |                 sat_{key}_hd
525 |                 {sat_source_sql_cols}
526 |             FROM new_stg_data
527 |             ;
528 |             "#);
529 |
530 |             sat_insert_dmls.push_str(&insert_sql);
531 |         }
532 |     }
533 |
534 |     sat_insert_dmls
535 | }
536 |
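The next function, `business_key_to_hub_dml`, emits two statements per hub: a one-time seed of the data vault "ghost" records (keys hashed from `-1` and `-2`, conventionally used for unknown and not-applicable members) and the idempotent main load. A sketch of how those seeded keys relate to the `auto_dw.hash` function (the helper is illustrative; it relies on `hash` being defined at the crate root in `lib.rs`):

```rust
// Sketch: the two ghost hub keys the init insert seeds. In the generated SQL
// this is auto_dw.hash(ARRAY_TO_STRING(ARRAY[-1], ',')::TEXT), i.e. hash('-1').
fn ghost_hub_keys() -> (String, String) {
    (crate::hash("-1"), crate::hash("-2"))
}
```

Because the init SELECTs are guarded by `WHERE NOT initialized.is_initialized`, the ghost rows are inserted only when the hub is empty.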
537 | fn business_key_to_hub_dml(business_key: &BusinessKey, dw_schema_name: &String) -> String {
538 |     let mut hub_insert_dml = String::new();
539 |
540 |     // Hub Buildout
541 |     let business_key_name = &business_key.name;
542 |
543 |     // Business Key Part(s)
544 |     let mut hub_bk_parts_sql = String::new();
545 |     for part_link in &business_key.business_key_part_links {
546 |         let r = format!(r#",
547 |         {}_bk"#, part_link.alias);
548 |         hub_bk_parts_sql.push_str(&r);
549 |     }
550 |
551 |     // INSERT INTO Header
552 |     let hub_insert_into_header_part_sql = format!(r#"
553 |     INSERT INTO {}.hub_{} (
554 |         hub_{}_hk,
555 |         load_ts,
556 |         record_source
557 |         {}
558 |     )
559 |     "#,
560 |     dw_schema_name, business_key_name, business_key_name, hub_bk_parts_sql);
561 |
562 |
563 |     // Business Key Part(s) Init SQL
564 |     let mut hub_bk_neg_1_init_parts_sql = String::new();
565 |     let mut hub_bk_neg_2_init_parts_sql = String::new();
566 |     for part_link in &business_key.business_key_part_links {
567 |         let neg_1: String = format!(r#",
568 |         '-1'::TEXT AS {}_bk"#, part_link.alias);
569 |         hub_bk_neg_1_init_parts_sql.push_str(&neg_1);
570 |         let neg_2: String = format!(r#",
571 |         '-2'::TEXT AS {}_bk"#, part_link.alias);
572 |         hub_bk_neg_2_init_parts_sql.push_str(&neg_2);
573 |     }
574 |
575 |     let hub_insert_into_init_part_sql = format!(r#"
576 |     WITH initialized AS (
577 |         SELECT
578 |             CASE
579 |                 WHEN COUNT(*) > 0 THEN TRUE
580 |                 ELSE FALSE
581 |             END is_initialized
582 |         FROM {dw_schema_name}.hub_{business_key_name}
583 |     )
584 |     SELECT
585 |         auto_dw.hash(ARRAY_TO_STRING(ARRAY[-1], ',')::TEXT) AS hub_{business_key_name}_hk,
586 |         '0001-01-01'::TIMESTAMP WITHOUT TIME ZONE AS load_ts,
587 |         'SYSTEM'::TEXT AS record_source
588 |         {hub_bk_neg_1_init_parts_sql}
589 |     FROM initialized WHERE NOT initialized.is_initialized
590 |     UNION
591 |     SELECT
592 |         auto_dw.hash(ARRAY_TO_STRING(ARRAY[-2], ',')::TEXT) AS hub_{business_key_name}_hk,
593 |         '0001-01-01'::TIMESTAMP WITHOUT TIME ZONE AS load_ts,
594 |         'SYSTEM'::TEXT AS record_source
595 |         {hub_bk_neg_2_init_parts_sql}
596 |     FROM initialized WHERE NOT initialized.is_initialized
597 |     ;
598 |     "#);
599 |
600 |     let hub_insert_init = hub_insert_into_header_part_sql.clone() + &hub_insert_into_init_part_sql;
601 |     hub_insert_dml.push_str(&hub_insert_init);
602 |
603 |     // Insert Main
604 |
605 |     // Array Parts
606 |     let mut hub_bk_parts_sql_stg_array = String::new();
607 |     for part_link in &business_key.business_key_part_links {
608 |         // TODO: Need to account for more than one source. However, the Vec data structure isn't ideal; refactor.
609 |         let e = format!(r#"stg.{}::TEXT,"#, part_link.source_columns[0].column_name);
610 |         hub_bk_parts_sql_stg_array.push_str(&e);
611 |     }
612 |     hub_bk_parts_sql_stg_array.pop(); // Removing the last ","
613 |
614 |     // Source Schema
615 |     let mut source_schema = String::new();
616 |     let mut source_table = String::new();
617 |
618 |     // Business Key Part(s)
619 |     let mut hub_bk_parts_stg_names = String::new();
620 |     for part_link in &business_key.business_key_part_links {
621 |         let source_column_name = &part_link.source_columns[0].column_name;
622 |         let source_column_alias = &part_link.alias;
623 |         let e = format!(r#",stg.{source_column_name}::TEXT AS {source_column_alias}_bk"#);
624 |         hub_bk_parts_stg_names.push_str(&e);
625 |         source_schema = part_link.source_columns[0].schema_name.clone();
626 |         source_table = part_link.source_columns[0].table_name.clone();
627 |     }
628 |
629 |     let hub_insert_into_main_part_sql = format!(r#"
630 |     WITH
631 |     stg_data AS (
632 |         SELECT
633 |             auto_dw.hash(
634 |                 ARRAY_TO_STRING(ARRAY[{hub_bk_parts_sql_stg_array}], ',')
635 |             ) AS hub_{business_key_name}_hk,
636 |             (CURRENT_TIMESTAMP AT TIME ZONE 'UTC')::TIMESTAMP(6) AS load_ts,
637 |             '{source_schema}' AS record_source
638 |             {hub_bk_parts_stg_names}
639 |         FROM {source_schema}.{source_table} AS stg
640 |     ),
641 |     new_stg_data AS (
642 |         SELECT stg_data.* FROM stg_data
643 |         LEFT JOIN {dw_schema_name}.hub_{business_key_name} ON stg_data.hub_{business_key_name}_hk = hub_{business_key_name}.hub_{business_key_name}_hk
644 |         WHERE hub_{business_key_name}.hub_{business_key_name}_hk IS NULL
645 |     )
646 |     SELECT
647 |         hub_{business_key_name}_hk,
648 |         load_ts,
649 |         record_source{hub_bk_parts_sql}
650 |     FROM new_stg_data
651 |     ;
652 |     "#
653 |     );
654 |
655 |     let hub_insert_main = hub_insert_into_header_part_sql + &hub_insert_into_main_part_sql;
656 |     hub_insert_dml.push_str(&hub_insert_main);
657 |
658 |     hub_insert_dml
659 | }
660 |
-------------------------------------------------------------------------------- /extension/src/controller/mod.rs: --------------------------------------------------------------------------------
1 | pub mod bgw_init;
2 | pub mod bgw_source_objects;
3 | pub mod bgw_transformer_client;
4 | pub mod dv_builder;
5 | pub mod dv_loader;
-------------------------------------------------------------------------------- /extension/src/lib.rs: --------------------------------------------------------------------------------
1 | mod controller; // Coordinates application logic and model-service interactions.
2 | mod model; // Defines data structures and data-related methods. 3 | mod utility; // Initialization, Configuration Management, and External Services 4 | 5 | use controller::dv_loader; 6 | pub use pgrx::prelude::*; 7 | use utility::guc; 8 | use uuid::Uuid; 9 | 10 | use sha2::{Sha256, Digest}; 11 | use hex; 12 | 13 | pgrx::pg_module_magic!(); 14 | 15 | use model::queries; 16 | 17 | #[pg_extern(name="go")] 18 | fn go_default() -> String { 19 | let accepted_transformer_confidence_level: String = 20 | utility::guc::get_guc(guc::PgAutoDWGuc::AcceptedTransformerConfidenceLevel) 21 | .unwrap_or_else(|| { 22 | error!("GUC: Unable to obtain parameter \"pg_auto_dw.accepted_transformer_confidence_level.\""); 23 | }); 24 | let build_id = Uuid::new_v4(); 25 | let message = format!("Build ID: {} | Data warehouse tables are currently being built.", build_id); 26 | info!("{}", message); 27 | let build_flag = "Build"; 28 | let build_status = "RTD"; 29 | let status = "Ready to Deploy"; 30 | let query_insert = &queries::insert_into_build_call( 31 | &build_id.to_string(), &build_flag, &build_status, &status, &accepted_transformer_confidence_level); 32 | _ = Spi::run(query_insert); 33 | let query_build_pull = &queries::build_object_pull(&build_id.to_string()); 34 | let load_data = true; 35 | controller::dv_builder::build_dv(build_id, query_build_pull, load_data); 36 | message 37 | } 38 | 39 | #[pg_extern(name="build")] 40 | fn build_default() -> String { 41 | let accepted_transformer_confidence_level: String = 42 | utility::guc::get_guc(guc::PgAutoDWGuc::AcceptedTransformerConfidenceLevel) 43 | .unwrap_or_else(|| { 44 | error!("GUC: Unable to obtain parameter \"pg_auto_dw.accepted_transformer_confidence_level.\""); 45 | }); 46 | let build_id = Uuid::new_v4(); 47 | let message = format!("Build ID: {} | Data warehouse tables are currently being built.", build_id); 48 | info!("{}", message); 49 | let build_flag = "Build"; 50 | let build_status = "RTD"; 51 | let status = "Ready to Deploy"; 52 | let query_insert = &queries::insert_into_build_call( 53 | &build_id.to_string(), &build_flag, &build_status, &status, &accepted_transformer_confidence_level); 54 | _ = Spi::run(query_insert); 55 | let query_build_pull = &queries::build_object_pull(&build_id.to_string()); 56 | let load_data = false; 57 | controller::dv_builder::build_dv(build_id, query_build_pull, load_data); 58 | message 59 | } 60 | 61 | // Syncing All DV Schemas 62 | #[pg_extern(name="sync")] 63 | fn sync_default() -> String { 64 | let load_complete = dv_loader::dv_load_schemas_all(); 65 | if load_complete { 66 | "All DV schema objects updated.".to_string() 67 | } else { 68 | "Failed Load".to_string() 69 | } 70 | } 71 | 72 | #[pg_extern] 73 | fn source_include( schema_pattern_include: &str, 74 | table_pattern_include: default!(Option<&str>, "NULL"), 75 | column_pattern_include: default!(Option<&str>, "NULL")) -> &'static str { 76 | // Include Patterns 77 | let schema_pattern_include: &str = schema_pattern_include; 78 | let table_pattern_include: &str = table_pattern_include.unwrap_or(".*"); 79 | let column_pattern_include: &str = column_pattern_include.unwrap_or(".*"); 80 | // Exclude Patterns 81 | let schema_pattern_exclude: &str = "a^"; 82 | let table_pattern_exclude: &str = "a^"; 83 | let column_pattern_exclude: &str = "a^"; 84 | _ = Spi::run(queries::source_object_dw( schema_pattern_include, 85 | table_pattern_include, 86 | column_pattern_include, 87 | schema_pattern_exclude, 88 | table_pattern_exclude, 89 | column_pattern_exclude) 90 | .as_str()); 
91 | "Pattern Included" 92 | } 93 | 94 | #[pg_extern] 95 | fn source_exclude( schema_pattern_exclude: &str, 96 | table_pattern_exclude: default!(Option<&str>, "NULL"), 97 | column_pattern_exclude: default!(Option<&str>, "NULL")) -> &'static str { 98 | let schema_pattern_include: &str = "a^"; 99 | let table_pattern_include: &str = "a^"; 100 | let column_pattern_include: &str = "a^"; 101 | let schema_pattern_exclude: &str = schema_pattern_exclude; 102 | let table_pattern_exclude: &str = table_pattern_exclude.unwrap_or(".*"); 103 | let column_pattern_exclude: &str = column_pattern_exclude.unwrap_or(".*"); 104 | _ = Spi::run(queries::source_object_dw( schema_pattern_include, 105 | table_pattern_include, 106 | column_pattern_include, 107 | schema_pattern_exclude, 108 | table_pattern_exclude, 109 | column_pattern_exclude) 110 | .as_str()); 111 | "Pattern Excluded" 112 | } 113 | 114 | #[pg_extern] 115 | fn source_column() -> Result< 116 | TableIterator< 117 | 'static, 118 | ( 119 | name!(schema, Option), 120 | name!(table, Option), 121 | name!(column, Option), 122 | name!(status, Option), 123 | name!(category, Option), 124 | name!(is_sensitive, Option), 125 | name!(confidence_level, Option), 126 | name!(status_response, Option), 127 | ) 128 | >, 129 | spi::Error, 130 | > { 131 | let accepted_transformer_confidence_level: String = 132 | utility::guc::get_guc(guc::PgAutoDWGuc::AcceptedTransformerConfidenceLevel) 133 | .unwrap_or_else(|| { 134 | error!("GUC: Unable to obtain parameter \"pg_auto_dw.accepted_transformer_confidence_level.\""); 135 | }); 136 | 137 | let query: &str = &queries::source_column(&accepted_transformer_confidence_level); 138 | 139 | info!("Evaluation of TABLE customer"); 140 | Spi::connect(|client| { 141 | Ok(client 142 | .select(query, None, None)? 143 | .map(|row| { 144 | ( 145 | row["schema"].value().ok().flatten(), 146 | row["table"].value().ok().flatten(), 147 | row["column"].value().ok().flatten(), 148 | row["status"].value().ok().flatten(), 149 | row["category"].value().ok().flatten(), 150 | row["is_sensitive"].value().ok().flatten(), 151 | row["confidence_level"].value().ok().flatten(), 152 | row["status_response"].value().ok().flatten(), 153 | ) 154 | }) 155 | .collect::>()) 156 | }) 157 | .map(TableIterator::new) 158 | } 159 | 160 | #[pg_extern(immutable, parallel_safe)] 161 | fn hash(input: &str) -> String { 162 | let digest = Sha256::digest(input.as_bytes()); 163 | hex::encode(digest) 164 | } 165 | 166 | #[cfg(any(test, feature = "pg_test"))] 167 | #[pg_schema] 168 | mod tests { 169 | use pgrx::prelude::*; 170 | 171 | // TODO: Unit Testing 172 | #[pg_test] 173 | fn test_go_default() { 174 | } 175 | 176 | } 177 | 178 | /// This module is required by `cargo pgrx test` invocations. 179 | /// It must be visible at the root of your extension crate. 
178 | /// This module is required by `cargo pgrx test` invocations.
179 | /// It must be visible at the root of your extension crate.
180 | #[cfg(test)]
181 | pub mod pg_test {
182 |     pub fn setup(_options: Vec<&str>) {
183 |         // perform one-off initialization when the pg_test framework starts
184 |     }
185 |
186 |     pub fn postgresql_conf_options() -> Vec<&'static str> {
187 |         // return any postgresql.conf settings that are required for your tests
188 |         vec![]
189 |     }
190 | }
191 |
-------------------------------------------------------------------------------- /extension/src/model/dv_schema.rs: --------------------------------------------------------------------------------
1 | use chrono::NaiveDateTime;
2 | use pgrx::pg_sys::Oid;
3 | use serde::{Deserialize, Serialize};
4 | use uuid::Uuid;
5 |
6 | #[derive(Serialize, Deserialize, Debug)]
7 | pub struct DVSchema {
8 |     #[serde(rename = "ID")]
9 |     pub id: Uuid,
10 |     #[serde(rename = "DW Schema")]
11 |     pub dw_schema: String,
12 |     #[serde(rename = "Create Date")]
13 |     pub create_timestamp_gmt: NaiveDateTime,
14 |     #[serde(rename = "Modified Date")]
15 |     pub modified_timestamp_gmt: NaiveDateTime,
16 |     #[serde(rename = "Business Keys")]
17 |     pub business_keys: Vec<BusinessKey>,
18 |     #[serde(rename = "Link Keys")]
19 |     pub link_keys: Vec<LinkKey>,
20 | }
21 |
22 | impl DVSchema {
23 |     pub fn get_columns(&self) -> Vec<(Oid, i16)> {
24 |         let mut columns: Vec<(Oid, i16)> = Vec::new();
25 |         for link_key in &self.link_keys {
26 |             for business_key in &link_key.business_keys {
27 |                 columns.append(&mut business_key.get_columns());
28 |             }
29 |
30 |             for descriptor in &link_key.descriptors {
31 |                 if let Some(source_column) = &descriptor.descriptor_link.source_column {
32 |                     columns.push(source_column.get_column());
33 |                 }
34 |             }
35 |         }
36 |
37 |         for business_key in &self.business_keys {
38 |             columns.append(&mut business_key.get_columns());
39 |         }
40 |         columns
41 |     }
42 | }
43 |
44 | #[derive(Serialize, Deserialize, Debug)]
45 | pub struct LinkKey {
46 |     #[serde(rename = "ID")]
47 |     pub id: Uuid,
48 |     #[serde(rename = "Name")]
49 |     pub name: String,
50 |     #[serde(rename = "Business Keys")]
51 |     pub business_keys: Vec<BusinessKey>,
52 |     #[serde(rename = "Descriptors")]
53 |     pub descriptors: Vec<Descriptor>, // Commonly multiple descriptor values, but may also contain none
54 | }
55 |
56 | #[derive(Serialize, Deserialize, Debug)]
57 | pub struct BusinessKey {
58 |     #[serde(rename = "ID")]
59 |     pub id: Uuid,
60 |     #[serde(rename = "Name")]
61 |     pub name: String,
62 |     #[serde(rename = "Business Key Part Links")]
63 |     pub business_key_part_links: Vec<BusinessKeyPartLink>,
64 |     #[serde(rename = "Descriptors")]
65 |     pub descriptors: Vec<Descriptor>, // Commonly multiple descriptor values, but may also contain none
66 | }
67 |
68 | #[derive(Serialize, Deserialize, Debug)]
69 | pub struct BusinessKeyPartLink {
70 |     #[serde(rename = "ID")]
71 |     pub id: Uuid,
72 |     #[serde(rename = "Alias")]
73 |     pub alias: String,
74 |     #[serde(rename = "Source Column Data")]
75 |     pub source_columns: Vec<ColumnData>,
76 |     #[serde(rename = "Hub Target Column Data")]
77 |     pub hub_target_column: Option<ColumnData>,
78 | }
79 |
80 | impl BusinessKey {
81 |     pub fn get_columns(&self) -> Vec<(Oid, i16)> {
82 |         let mut columns: Vec<(Oid, i16)> = Vec::new();
83 |         // BK Part Search
84 |         for bkp_link in &self.business_key_part_links {
85 |             for source_column in &bkp_link.source_columns {
86 |                 columns.push(source_column.get_column());
87 |             }
88 |         }
89 |         // Descriptor Search
90 |         for descriptor in &self.descriptors {
91 |             if let Some(source_column) = &descriptor.descriptor_link.source_column {
92 |                 columns.push(source_column.get_column());
93 |             }
94 |         }
95 |         columns
96 |     }
97 | }
98 |
99 | #[derive(Serialize, Deserialize, Debug)]
100 | pub
struct Descriptor {
101 |     #[serde(rename = "ID")]
102 |     pub id: Uuid,
103 |     #[serde(rename = "Descriptor Link")]
104 |     pub descriptor_link: DescriptorLink,
105 |     #[serde(rename = "Orbit")]
106 |     pub orbit: String,
107 |     #[serde(rename = "Is Sensitive")]
108 |     pub is_sensitive: bool,
109 | }
110 |
111 | #[derive(Serialize, Deserialize, Debug)]
112 | pub struct DescriptorLink {
113 |     #[serde(rename = "ID")]
114 |     pub id: Uuid,
115 |     #[serde(rename = "Alias")]
116 |     pub alias: String,
117 |     #[serde(rename = "Source Column Data")]
118 |     pub source_column: Option<ColumnData>,
119 |     #[serde(rename = "Target Column Data")]
120 |     pub target_column: Option<ColumnData>,
121 | }
122 |
123 | #[derive(Serialize, Deserialize, Debug)]
124 | pub struct ColumnData {
125 |     #[serde(rename = "ID")]
126 |     pub id: Uuid,
127 |     #[serde(rename = "System ID")]
128 |     pub system_id: i64,
129 |     #[serde(rename = "Schema Name")]
130 |     pub schema_name: String,
131 |     #[serde(rename = "Table OID")]
132 |     pub table_oid: Oid,
133 |     #[serde(rename = "Table Name")]
134 |     pub table_name: String,
135 |     #[serde(rename = "Column Name")]
136 |     pub column_name: String,
137 |     #[serde(rename = "Column Ordinal Position")]
138 |     pub column_ordinal_position: i16,
139 |     #[serde(rename = "Column Type")]
140 |     pub column_type_name: String,
141 | }
142 |
143 | impl ColumnData {
144 |     pub fn get_column(&self) -> (Oid, i16) {
145 |         (self.table_oid, self.column_ordinal_position)
146 |     }
147 | }
148 |
-------------------------------------------------------------------------------- /extension/src/model/mod.rs: --------------------------------------------------------------------------------
1 | pub mod source_objects;
2 | pub mod dv_schema;
3 | pub mod queries;
4 | pub mod prompt_template;
-------------------------------------------------------------------------------- /extension/src/model/prompt_template.rs: --------------------------------------------------------------------------------
1 | #[derive(Debug)]
2 | pub enum PromptTemplate {
3 |     BKComponentIdentification,
4 |     BKName,
5 |     DescriptorSensitive,
6 | }
7 |
8 | impl PromptTemplate {
9 |     pub fn template(&self) -> &str {
10 |         match self {
11 |             PromptTemplate::BKComponentIdentification => r#"
12 | Task Title: Business Key Component Identification by Column in JSON Source Table Object
13 |
14 | You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your task is to evaluate if a specified column is a business key component and, if so, how likely it is. The results of your evaluations will be used to create downstream data vault structures.
15 |
16 | A business key component is an attribute that forms part of a business key, which may be either a component of a composite key or a single key that uniquely identifies the record set. Additionally, there may be multiple business keys within one table.
17 |
18 | Requested Task:
19 |
20 | Determine whether the specified column, identified by its column number (“column no”), is likely to represent a business key or a component of a business key.
21 |
22 | Request Details:
23 |
24 | If the column is a primary key, as indicated in the comments or column details, assume it is a business key component. However, this does not exclude the possibility of other business key components within the table, but it may reduce the likelihood of the specified column being the only business key.
25 | 26 | If the specified column could be categorized as an email or username, only consider it a business key component if there are no other attributes in the table that could reasonably serve as a business key component. 27 | 28 | Use the column comments, when available, as the primary source of definition, providing direct context from business users. These comments should take priority over the column’s name or data type in determining its purpose and usage. 29 | 30 | Confidence Value: 31 | 32 | Provide a confidence score between 0 and 1, rounded to two decimal places, representing your confidence in the likelihood that the column is a business key component. A value of 0.80 or higher is considered reasonably confident. 33 | 34 | Reason: 35 | 36 | Indicate why you made the decision you did. 37 | 38 | Output: 39 | 40 | Ensure the output conforms to the format shown in the examples below. 41 | 42 | Example Input 1) 43 | JSON Source Table Object: 44 | { 45 | "Schema Name": "public", 46 | "Table Name": "customer", 47 | "Column Details": [ 48 | "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", 49 | "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", 50 | "Column No: 3 Named: state of type: character(2) Column Comments: NA", 51 | "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" 52 | ] 53 | } 54 | 55 | Column No: 1 56 | 57 | Example Output 1) 58 | { 59 | "Business Key Component Identification": { 60 | "Is Business Key Component": true, 61 | "Confidence Value": 0.95, 62 | "Reason": "The 'customer_id' column is designated as the primary key, which is typically the best candidate for a business key component in the 'customer' table." 63 | } 64 | } 65 | 66 | Example Input 2) 67 | JSON Source Table Object: 68 | { 69 | "Schema Name": "sales", 70 | "Table Name": "order_details", 71 | "Column Details": [ 72 | "Column No: 1 Named: id of type: integer Column Comments: NA", 73 | "Column No: 2 Named: product_id of type: integer Column Comments: NA", 74 | "Column No: 3 Named: quantity of type: integer Column Comments: NA", 75 | "Column No: 4 Named: order_date of type: date Column Comments: NA" 76 | ] 77 | } 78 | 79 | Column No: 1 80 | 81 | Example Output 2) 82 | { 83 | "Business Key Component Identification": { 84 | "Is Business Key Component": true, 85 | "Confidence Value": 0.75, 86 | "Reason": "Although 'id' is not explicitly marked as a primary key, it is likely to uniquely identify each order detail, making it a strong candidate for a business key component." 87 | } 88 | } 89 | 90 | Example Input 3) 91 | JSON Source Table Object: 92 | { 93 | "Schema Name": "sales", 94 | "Table Name": "order_details", 95 | "Column Details": [ 96 | "Column No: 1 Named: order_id of type: integer Column Comments: NA", 97 | "Column No: 2 Named: product_id of type: integer Column Comments: NA", 98 | "Column No: 3 Named: quantity of type: integer Column Comments: NA", 99 | "Column No: 4 Named: order_date of type: date Column Comments: NA" 100 | ] 101 | } 102 | 103 | Column No: 1 104 | 105 | Example Output 3) 106 | { 107 | "Business Key Component Identification": { 108 | "Is Business Key Component": true, 109 | "Confidence Value": 0.85, 110 | "Reason": "The 'order_id' column likely represents the primary identifier for each order within the 'order_details' table. 
Although it is not explicitly marked as a primary key, 'order_id' is a common identifier for business entities, making it a strong candidate for a business key component." 111 | } 112 | } 113 | 114 | Example Input 4) 115 | JSON Source Table Object: 116 | { 117 | "Schema Name": "sales", 118 | "Table Name": "order_details", 119 | "Column Details": [ 120 | "Column No: 1 Named: order_id of type: integer Column Comments: NA", 121 | "Column No: 2 Named: product_id of type: integer Column Comments: NA", 122 | "Column No: 3 Named: quantity of type: integer Column Comments: NA", 123 | "Column No: 4 Named: order_date of type: date Column Comments: NA" 124 | ] 125 | } 126 | 127 | Column No: 2 128 | 129 | 130 | Example Output 4) 131 | { 132 | "Business Key Component Identification": { 133 | "Is Business Key Component": true, 134 | "Confidence Value": 0.80, 135 | "Reason": "'product_id' likely represents a key component that helps identify specific products associated with the order. It is not the sole key but may serve as part of a composite business key alongside 'order_id'." 136 | } 137 | } 138 | 139 | 140 | Example Input 5) 141 | JSON Source Table Object: 142 | { 143 | "Schema Name": "sales", 144 | "Table Name": "order_details", 145 | "Column Details": [ 146 | "Column No: 1 Named: order_id of type: integer Column Comments: NA", 147 | "Column No: 2 Named: product_id of type: integer Column Comments: NA", 148 | "Column No: 3 Named: quantity of type: integer Column Comments: NA", 149 | "Column No: 4 Named: order_date of type: date Column Comments: NA" 150 | ] 151 | } 152 | 153 | Column No: 3 154 | 155 | Example Output 5) 156 | { 157 | "Business Key Component Identification": { 158 | "Is Business Key Component": false, 159 | "Confidence Value": 0.30, 160 | "Reason": "The 'quantity' column represents a numeric value related to the number of products in the order, but it does not uniquely identify the record. It is unlikely to serve as a business key component." 161 | } 162 | } 163 | 164 | 165 | Example Input 6) 166 | JSON Source Table Object: 167 | { 168 | "Schema Name": "sales", 169 | "Table Name": "order_details", 170 | "Column Details": [ 171 | "Column No: 1 Named: order_id of type: integer Column Comments: NA", 172 | "Column No: 2 Named: product_id of type: integer Column Comments: NA", 173 | "Column No: 3 Named: quantity of type: integer Column Comments: NA", 174 | "Column No: 4 Named: order_date of type: date Column Comments: NA" 175 | ] 176 | } 177 | 178 | Column No: 4 179 | 180 | Example Output 6) 181 | { 182 | "Business Key Component Identification": { 183 | "Is Business Key Component": false, 184 | "Confidence Value": 0.40, 185 | "Reason": "'order_date' represents the date on which the order was placed. While it provides important context, it is not unique to the record and is therefore unlikely to serve as a business key component." 186 | } 187 | } 188 | 189 | Now, based on the instructions and examples above, please generate the JSON output for the following input. {hints} 190 | 191 | JSON Source Table Object: {new_json} 192 | 193 | Column No: {column_no} 194 | "#, 195 | PromptTemplate::BKName => r#" 196 | Task Title: Business Key Naming in JSON Source Table Object with specified Column 197 | 198 | You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your responses to requested tasks will be used to help create downstream data vault tables. 199 | 200 | Requested Task: Identify the business key name. 
The business key part column has already been identified, and its associated column number, “column no”, will be provided along with the JSON Source Table Object. Return a name that best represents the business key from a data vault perspective. 201 | 202 | Request Details: 203 | 204 | The Business Key Name should be crafted based on the attribute linked to the business key, as identified by the provided column number. Prioritize the attribute name over the table name if the attribute name is descriptive enough. It should clearly represent the core business entity, avoiding generic terms like “ID,” “number,” or “Entity.” The name should focus solely on the business aspect, using terms like “customer,” “employee,” or “seller” that directly reflect the entity’s purpose, without unnecessary suffixes or identifiers. If the attribute associated with the business key or its column comments are not descriptive enough, the table name or schema name can be used to help formulate the Business Key Name. 205 | 206 | Use the column comments, when available, as the primary source of definition, providing direct context from business users. These comments should take priority over the column’s name or data type in determining its business key name. 207 | 208 | Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in your chosen Business Key Name. A value of 0.80 or higher is considered reasonably confident. 209 | 210 | Reason: Indicate why you made the decision you did. 211 | 212 | Output: Ensure the output conforms to the format shown in the examples below. 213 | 214 | Example Input 1) 215 | JSON Source Table Object: 216 | { 217 | "Schema Name": "public", 218 | "Table Name": "customer", 219 | "Column Details": [ 220 | "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", 221 | "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", 222 | "Column No: 3 Named: state of type: character(2) Column Comments: NA", 223 | "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" 224 | ] 225 | } 226 | 227 | Column No: 1 228 | 229 | Example Output 1) 230 | { 231 | "Business Key Name": { 232 | "Name": "Customer", 233 | "Confidence Value": 0.9, 234 | "Reason": "The column 'customer_id' is a primary key and represents the unique identifier for customers in the 'customer' table. Given that the table name 'customer' directly reflects the business entity, 'Customer' is chosen as the Business Key Name. The confidence value is high because the identifier is straightforward and strongly aligned with the core business entity." 235 | } 236 | } 237 | 238 | Example Input 2) 239 | JSON Source Table Object: 240 | { 241 | "Schema Name": "sales", 242 | "Table Name": "order_details", 243 | "Column Details": [ 244 | "Column No: 1 Named: id of type: integer Column Comments: NA", 245 | "Column No: 2 Named: product_id of type: integer Column Comments: NA", 246 | "Column No: 3 Named: quantity of type: integer Column Comments: NA", 247 | "Column No: 4 Named: order_date of type: date Column Comments: NA" 248 | ] 249 | } 250 | 251 | Column No: 1 252 | 253 | Example Output 2) 254 | { 255 | "Business Key Name": { 256 | "Name": "Order", 257 | "Confidence Value": 0.85, 258 | "Reason": "The column 'id' is a primary key and serves as the unique identifier for records in the 'order_details' table. 
Although the column name 'id' is generic, the table name 'order_details' indicates that the records pertain to individual orders. Therefore, 'Order' is chosen as the Business Key Name to best represent the core business entity. The confidence value is slightly lower due to the generic nature of the column name, but it is still reasonably confident given the context provided by the table name." 259 | } 260 | } 261 | 262 | Now, based on the instructions and examples above, please generate the JSON output for the following input. {hints} 263 | 264 | JSON Source Table Object: {new_json} 265 | 266 | Column No: {column_no} 267 | "#, 268 | PromptTemplate::DescriptorSensitive => r#" 269 | Task Title: Identification of PII in JSON Source Table Object 270 | 271 | You have a JSON Source Table Object that includes the schema name, table name, and detailed column information. Your task is to assist in the creation of downstream data vault tables by performing the requested tasks based on this information. 272 | 273 | Requested Task: Identify whether the descriptor is a “Descriptor - Sensitive” PII subtype. A descriptor column, along with its associated column number (“column no”), will be provided in the JSON Source Table Object. If you determine that the column contains Personally Identifiable Information (PII), categorize it as “Descriptor - Sensitive.” 274 | 275 | Request Details: 276 | PII Identification: Only consider a column as PII if it directly matches an item from the PII list provided below. Do not infer or project beyond this list. If a column name or its associated comment closely resembles an item from the list, classify it as PII. 277 | No Overgeneralization: Avoid overgeneralization or inference beyond what is explicitly stated in the list. Focus strictly on the provided PII list. 278 | 279 | Use the column comments, when available, as the primary source of definition, providing direct context from business users. These comments should take priority over the column’s name or data type in determining its identity. 280 | 281 | Personally Identifiable Information (PII) List: 282 | 283 | Consider any of the following types of information as PII and categorize the corresponding column as “Descriptor - Sensitive”: 284 | 285 | - Person’s Name: PII (Includes first name, last name, or both). 286 | - Social Security Number (SSN): PII 287 | - Driver’s License Number: PII 288 | - Passport Number: PII 289 | - Email Address: PII 290 | - Physical Street Address: PII (Includes street address, but excludes City, State, or standard 5-digit Zip code). 291 | - Extended Zip Code: PII (Any Zip code with more than 5 digits). 292 | - Telephone Number: PII (Includes both landline and mobile numbers). 293 | - Date of Birth: PII 294 | - Place of Birth: PII 295 | - Biometric Data: PII (Includes fingerprints, facial recognition data, iris scans). 296 | - Medical Information: PII (Includes health records, prescriptions). 297 | - Financial Information: PII (Includes bank account numbers, credit card numbers, debit card numbers). 298 | - Employment Information: PII (Includes employment records, salary information). 299 | - Insurance Information: PII (Includes policy numbers, claim information). 300 | - Education Records: PII (Includes student records, transcripts). 301 | - Online Identifiers: PII (Includes usernames, IP addresses, cookies, MAC addresses). 302 | - Photographs or Videos: PII (Any media that can identify an individual). 
303 | - National Identification Numbers: PII (Includes identifiers outside of SSN, such as National Insurance Numbers in the UK). 304 | - Geolocation Data: PII (Includes GPS coordinates, location history). 305 | - Vehicle Registration Numbers: PII 306 | 307 | Not PII: 308 | 309 | Some data may seem personally identifiable; however, it is not specific enough to identify an individual. 310 | 311 | - Standard 5-Digit Zip Code: Not PII 312 | - City: Not PII 313 | - State: Not PII 314 | - Country: Not PII 315 | - Age (in years): Not PII (Unless combined with other identifiers like date of birth). 316 | - Date or Timestamp (Example: created_date, created_timestamp, update_date, update_timestamp): Not PII (Unless combined with other identifiers like date of birth). 317 | - Gender: Not PII 318 | - Ethnicity/Race: Not PII (General categories, e.g., “Caucasian,” “Asian,” without additional identifiers). 319 | - Publicly Available Information: Not PII (Any information that is lawfully made available from federal, state, or local government records). 320 | - Generic Job Titles: Not PII (Titles like “Manager,” “Engineer,” without additional identifying details). 321 | - Company/Organization Name: Not PII (Names of companies or organizations without personal identifiers). 322 | 323 | Confidence Value: Provide a score between 0 and 1, rounded to two decimal places, representing your confidence in your “Is PII” determination of true or false. A value of 0.80 or higher is considered reasonably confident in your true or false answer. 324 | 325 | 326 | Reason: Indicate why you made the decision you did. 327 | 328 | Output: Please ensure that your output is JSON and matches the structure of the output examples provided. 329 | 330 | Example Input 1) 331 | JSON Source Table Object: 332 | { 333 | "Schema Name": "public", 334 | "Table Name": "customer", 335 | "Column Details": [ 336 | "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", 337 | "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", 338 | "Column No: 3 Named: state of type: character(2) Column Comments: NA", 339 | "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" 340 | ] 341 | } 342 | 343 | Column No: 4 344 | 345 | Example Output 1) 346 | { 347 | "Descriptor - Sensitive": { 348 | "Is PII": true, 349 | "Confidence Value": 0.85, 350 | "Reason": "The 'zip' column is identified as PII because its data type, character varying(10), allows for the possibility of storing extended zip codes, which matches an item on the provided PII list." 351 | } 352 | } 353 | 354 | Example Input 2) 355 | JSON Source Table Object: 356 | { 357 | "Schema Name": "public", 358 | "Table Name": "customer", 359 | "Column Details": [ 360 | "Column No: 1 Named: customer_id of type: uuid And is a primary key. Column Comments: NA", 361 | "Column No: 2 Named: city of type: character varying(255) Column Comments: NA", 362 | "Column No: 3 Named: state of type: character(2) Column Comments: NA", 363 | "Column No: 4 Named: zip of type: character varying(10) Column Comments: NA" 364 | ] 365 | } 366 | 367 | Column No: 2 368 | 369 | Example Output 2) 370 | { 371 | "Descriptor - Sensitive": { 372 | "Is PII": false, 373 | "Confidence Value": 0.90, 374 | "Reason": "The 'city' column is not considered PII because city names do not match any item on the provided PII list." 
375 | } 376 | } 377 | 378 | Example Input 3) 379 | JSON Source Table Object: 380 | { 381 | "Schema Name": "public", 382 | "Table Name": "employee", 383 | "Column Details": [ 384 | "Column No: 1 Named: employee_id of type: uuid And is a primary key. Column Comments: NA", 385 | "Column No: 2 Named: full_name of type: character varying(255) Column Comments: NA", 386 | "Column No: 3 Named: email of type: character varying(255) Column Comments: NA", 387 | "Column No: 4 Named: salary of type: numeric Column Comments: NA" 388 | ] 389 | } 390 | 391 | Column No: 2 392 | 393 | Example Output 3) 394 | { 395 | "Descriptor - Sensitive": { 396 | "Is PII": true, 397 | "Confidence Value": 0.95, 398 | "Reason": "The 'full_name' column is identified as PII because it matches the 'Person's Name' item from the provided PII list." 399 | } 400 | } 401 | 402 | Example Input 4) 403 | JSON Source Table Object: 404 | { 405 | "Schema Name": "public", 406 | "Table Name": "order", 407 | "Column Details": [ 408 | "Column No: 1 Named: order_id of type: uuid And is a primary key. Column Comments: NA", 409 | "Column No: 2 Named: order_date of type: date Column Comments: NA", 410 | "Column No: 3 Named: customer_email of type: character varying(255) Column Comments: 'Email address of the customer who placed the order'", 411 | "Column No: 4 Named: total_amount of type: numeric Column Comments: NA" 412 | ] 413 | } 414 | 415 | Column No: 3 416 | 417 | Example Output 4) 418 | { 419 | "Descriptor - Sensitive": { 420 | "Is PII": true, 421 | "Confidence Value": 0.98, 422 | "Reason": "The 'customer_email' column is identified as PII because it matches the 'Email Address' item from the provided PII list." 423 | } 424 | } 425 | 426 | Now, based on the instructions and examples above, please generate the appropriate JSON output only for the following JSON Source Table Object and Column No inputs. 
{hints} 427 | 428 | JSON Source Table Object: {new_json} 429 | 430 | Column No: {column_no} 431 | 432 | "#, 433 | } 434 | } 435 | } -------------------------------------------------------------------------------- /extension/src/model/queries.rs: -------------------------------------------------------------------------------- 1 | use crate::utility::guc; 2 | 3 | pub const SOURCE_OBJECTS_JSON: &str = r#" 4 | WITH 5 | table_transformation_time_cal AS ( 6 | SELECT 7 | s.table_oid, 8 | MAX(s.valid_from) AS max_table_update, 9 | MAX(t.created_at) AS max_table_transformer_generation 10 | FROM auto_dw.source_objects AS s 11 | LEFT JOIN auto_dw.transformer_responses AS t ON s.pk_source_objects = t.fk_source_objects 12 | WHERE current_flag = 'Y' AND deleted_flag = 'N' 13 | GROUP BY table_oid), 14 | tables_requiring_transformation AS ( 15 | SELECT DISTINCT table_oid FROM table_transformation_time_cal 16 | WHERE (max_table_update > max_table_transformer_generation) OR max_table_transformer_generation IS NULL 17 | ), 18 | source_table_details AS ( 19 | SELECT s.* 20 | FROM auto_dw.source_objects AS s 21 | JOIN tables_requiring_transformation AS t ON s.table_oid = t.table_oid 22 | WHERE current_flag = 'Y' AND deleted_flag = 'N' 23 | ), 24 | source_prep AS ( 25 | SELECT 26 | table_oid, 27 | column_ordinal_position, 28 | json_build_object( 29 | 'PK Source Objects', pk_source_objects, 30 | 'Column Ordinal Position', column_ordinal_position 31 | ) AS column_link, 32 | schema_name, table_name, 33 | 'Column No: ' || column_ordinal_position || ' ' || 34 | 'Named: ' || column_name || ' ' || 35 | 'of type: ' || column_type_name || ' ' || 36 | CASE 37 | WHEN column_pk_ind = 1 THEN 'And is a primary key.' ELSE '' 38 | END || 39 | 'Column Comments: ' || column_description 40 | AS column_details 41 | FROM source_table_details 42 | ) 43 | SELECT 44 | table_oid, 45 | json_build_object( 46 | 'Column Links', array_agg(column_link ORDER BY column_ordinal_position ASC) 47 | ) AS table_column_links, 48 | json_build_object( 49 | 'Schema Name', schema_name, 50 | 'Table Name', table_name, 51 | 'Column Details', array_agg(column_details ORDER BY column_ordinal_position ASC) 52 | ) AS table_details 53 | FROM source_prep 54 | GROUP BY table_oid, schema_name, table_name 55 | ; 56 | "#; 57 | 58 | #[no_mangle] 59 | pub fn source_object_dw(schema_pattern_include: &str, table_pattern_include: &str, column_pattern_include: &str, schema_pattern_exclude: &str, table_pattern_exclude: &str, column_pattern_exclude: &str) -> String { 60 | 61 | let dw_schema = guc::get_guc(guc::PgAutoDWGuc::DwSchema).expect("DW SCHEMA GUC is not set."); 62 | 63 | format!(r#" 64 | DROP TABLE IF EXISTS temp_source_objects; 65 | 66 | CREATE TEMPORARY TABLE temp_source_objects AS 67 | WITH 68 | schema_qry AS ( 69 | SELECT 70 | pg_namespace.oid AS schema_oid, 71 | pg_namespace.nspname AS schema_name, 72 | pg_description.description AS schema_description 73 | FROM pg_catalog.pg_namespace 74 | LEFT JOIN pg_catalog.pg_description ON pg_namespace.oid = pg_description.objoid AND 75 | pg_description.objsubid = 0 -- No Sub Objects 76 | WHERE pg_namespace.nspname !~ 'pg_.*' AND pg_namespace.nspname NOT IN ('information_schema', 'auto_dw', '{dw_schema}') 77 | ), 78 | table_qry AS ( 79 | SELECT 80 | pg_class.oid AS table_oid, 81 | pg_class.relname AS table_name, 82 | pg_class.relnamespace AS table_schema_oid, 83 | pg_description.description AS table_description 84 | FROM pg_catalog.pg_class 85 | LEFT JOIN pg_catalog.pg_description ON pg_class.oid = pg_description.objoid AND 
86 | pg_description.objsubid = 0 -- No Sub Objects 87 | WHERE 88 | pg_class.relkind IN ('r', 'f') -- 'r' stands for ordinary table, 'f' stands for foreign table 89 | ), 90 | column_qry AS ( 91 | SELECT 92 | pg_attribute.attrelid AS column_table_oid, 93 | pg_attribute.attname AS column_name, 94 | pg_attribute.attnum AS column_ordinal_position, 95 | pg_attribute.atttypid AS column_type_oid, 96 | pg_attribute.atttypmod AS column_modification_number, 97 | pg_catalog.format_type(atttypid, atttypmod) AS column_type_name, 98 | pg_description.description AS column_description 99 | FROM pg_attribute 100 | LEFT JOIN pg_catalog.pg_description ON pg_attribute.attrelid = pg_description.objoid AND 101 | pg_attribute.attnum = pg_description.objsubid 102 | WHERE 103 | pg_attribute.attnum > 0 -- Only real columns, not system columns 104 | AND NOT pg_attribute.attisdropped -- Only columns that are not dropped 105 | ), 106 | type_qry AS ( 107 | SELECT 108 | oid AS type_oid, 109 | typname AS base_type_name 110 | FROM pg_type 111 | ), 112 | pk_table_column_qry AS ( 113 | SELECT 114 | conrelid AS table_oid, 115 | unnest(conkey) AS column_ordinal_position, 116 | 1 AS column_pk_ind, 117 | conname AS column_pk_name 118 | FROM 119 | pg_constraint 120 | WHERE 121 | contype = 'p' 122 | ), 123 | fk_table_column_qry AS ( 124 | SELECT DISTINCT -- DISTINCT: one column could have multiple FKs. 125 | conrelid AS table_oid, 126 | unnest(conkey) AS column_ordinal_position, 127 | 1 AS column_fk_ind 128 | FROM 129 | pg_constraint 130 | WHERE 131 | contype = 'f' 132 | ), 133 | source_objects_prep AS ( 134 | SELECT 135 | schema_qry.schema_oid, 136 | schema_qry.schema_name, 137 | schema_qry.schema_description, 138 | table_qry.table_oid, 139 | table_qry.table_name, 140 | COALESCE(table_qry.table_description, 'NA') AS table_description, 141 | column_qry.column_ordinal_position, 142 | column_qry.column_name, 143 | type_qry.base_type_name AS column_base_type_name, 144 | column_qry.column_modification_number, 145 | column_qry.column_type_name, 146 | COALESCE(column_qry.column_description, 'NA') AS column_description, 147 | COALESCE(pk_table_column_qry.column_pk_ind, 0) AS column_pk_ind, 148 | COALESCE(pk_table_column_qry.column_pk_name, 'NA') AS column_pk_name, 149 | COALESCE(fk_table_column_qry.column_fk_ind, 0) AS column_fk_ind 150 | FROM schema_qry 151 | LEFT JOIN table_qry ON schema_qry.schema_oid = table_qry.table_schema_oid 152 | LEFT JOIN column_qry ON table_qry.table_oid = column_qry.column_table_oid 153 | LEFT JOIN type_qry ON column_qry.column_type_oid = type_qry.type_oid 154 | LEFT JOIN pk_table_column_qry ON 155 | table_qry.table_oid = pk_table_column_qry.table_oid AND 156 | column_qry.column_ordinal_position = pk_table_column_qry.column_ordinal_position 157 | LEFT JOIN fk_table_column_qry ON 158 | table_qry.table_oid = fk_table_column_qry.table_oid AND 159 | column_qry.column_ordinal_position = fk_table_column_qry.column_ordinal_position 160 | ), 161 | table_source_list AS ( 162 | -- Currently on List 163 | SELECT 164 | schema_oid, 165 | table_oid, 166 | column_ordinal_position 167 | FROM auto_dw.source_objects 168 | WHERE current_flag = 'Y' AND deleted_flag = 'N' 169 | -- Adding TABLE COLUMNS 170 | UNION 171 | SELECT 172 | schema_oid, 173 | table_oid, 174 | column_ordinal_position 175 | FROM source_objects_prep 176 | -- 'a^' ~ matches nothing. 
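-- The '{}' placeholders below are filled with POSIX regex patterns and matched via the '~' operator: '.*' matches every name, while a pattern such as 'a^' can never match and therefore selects nothing, effectively disabling that include or exclude filter.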
177 | WHERE 178 | schema_name ~ '{}' AND 179 | table_name ~ '{}' AND 180 | column_name ~ '{}' 181 | --- Removing excluded schemas, tables, and columns 182 | EXCEPT 183 | SELECT 184 | schema_oid, 185 | table_oid, 186 | column_ordinal_position 187 | FROM source_objects_prep 188 | WHERE 189 | schema_name ~ '{}' AND 190 | table_name ~ '{}' AND 191 | column_name ~ '{}' 192 | ) 193 | SELECT 194 | source_objects_prep.schema_oid, 195 | source_objects_prep.schema_name, 196 | source_objects_prep.schema_description, 197 | source_objects_prep.table_oid, 198 | source_objects_prep.table_name, 199 | source_objects_prep.table_description, 200 | source_objects_prep.column_ordinal_position, 201 | source_objects_prep.column_name, 202 | source_objects_prep.column_base_type_name, 203 | source_objects_prep.column_modification_number, 204 | source_objects_prep.column_type_name, 205 | source_objects_prep.column_description, 206 | source_objects_prep.column_pk_ind, 207 | source_objects_prep.column_pk_name, 208 | source_objects_prep.column_fk_ind 209 | FROM source_objects_prep 210 | JOIN table_source_list ON 211 | source_objects_prep.schema_oid = table_source_list.schema_oid AND -- Remove to track tables even if they move schemas. 212 | source_objects_prep.table_oid = table_source_list.table_oid AND 213 | source_objects_prep.column_ordinal_position = table_source_list.column_ordinal_position 214 | ORDER BY source_objects_prep.schema_name, source_objects_prep.table_name, source_objects_prep.column_ordinal_position 215 | ; 216 | 217 | -- Mark anything that was deleted. 218 | UPDATE auto_dw.source_objects 219 | SET deleted_flag = 'Y' 220 | WHERE source_objects.current_flag = 'Y' 221 | AND NOT EXISTS ( 222 | SELECT 1 223 | FROM temp_source_objects 224 | WHERE source_objects.schema_oid = temp_source_objects.schema_oid 225 | AND source_objects.table_oid = temp_source_objects.table_oid 226 | AND source_objects.column_ordinal_position = temp_source_objects.column_ordinal_position 227 | ); 228 | 229 | -- If anything associated with current columns changes, set the current_flag to 'N' 230 | UPDATE auto_dw.source_objects 231 | SET valid_to = (now() AT TIME ZONE 'UTC'), current_flag = 'N' 232 | FROM temp_source_objects 233 | WHERE source_objects.current_flag = 'Y' 234 | AND source_objects.schema_oid = temp_source_objects.schema_oid 235 | AND source_objects.table_oid = temp_source_objects.table_oid 236 | AND source_objects.column_ordinal_position = temp_source_objects.column_ordinal_position 237 | AND ( 238 | source_objects.schema_name IS DISTINCT FROM temp_source_objects.schema_name OR 239 | source_objects.schema_description IS DISTINCT FROM temp_source_objects.schema_description OR 240 | source_objects.table_name IS DISTINCT FROM temp_source_objects.table_name OR 241 | source_objects.table_description IS DISTINCT FROM temp_source_objects.table_description OR 242 | source_objects.column_name IS DISTINCT FROM temp_source_objects.column_name OR 243 | source_objects.column_base_type_name IS DISTINCT FROM temp_source_objects.column_base_type_name OR 244 | source_objects.column_modification_number IS DISTINCT FROM temp_source_objects.column_modification_number OR 245 | source_objects.column_type_name IS DISTINCT FROM temp_source_objects.column_type_name OR 246 | source_objects.column_description IS DISTINCT FROM temp_source_objects.column_description OR 247 | source_objects.column_pk_ind IS DISTINCT FROM temp_source_objects.column_pk_ind OR 248 | source_objects.column_pk_name IS DISTINCT FROM temp_source_objects.column_pk_name OR 249 | source_objects.column_fk_ind 
IS DISTINCT FROM temp_source_objects.column_fk_ind 250 | ); 251 | 252 | -- If anything that was deleted from the prior record set comes back, clear the deleted flag. 253 | UPDATE auto_dw.source_objects 254 | SET deleted_flag = 'N' 255 | FROM temp_source_objects 256 | WHERE source_objects.current_flag = 'Y' AND source_objects.deleted_flag = 'Y' 257 | AND source_objects.schema_oid = temp_source_objects.schema_oid 258 | AND source_objects.table_oid = temp_source_objects.table_oid 259 | AND source_objects.column_ordinal_position = temp_source_objects.column_ordinal_position 260 | AND ( 261 | source_objects.schema_name = temp_source_objects.schema_name OR 262 | source_objects.schema_description = temp_source_objects.schema_description OR 263 | source_objects.table_name = temp_source_objects.table_name OR 264 | source_objects.table_description = temp_source_objects.table_description OR 265 | source_objects.column_name = temp_source_objects.column_name OR 266 | source_objects.column_base_type_name = temp_source_objects.column_base_type_name OR 267 | source_objects.column_modification_number = temp_source_objects.column_modification_number OR 268 | source_objects.column_type_name = temp_source_objects.column_type_name OR 269 | source_objects.column_description = temp_source_objects.column_description OR 270 | source_objects.column_pk_ind = temp_source_objects.column_pk_ind OR 271 | source_objects.column_pk_name = temp_source_objects.column_pk_name OR 272 | source_objects.column_fk_ind = temp_source_objects.column_fk_ind 273 | ); 274 | 275 | -- Inserting new records. 276 | INSERT INTO auto_dw.source_objects ( 277 | schema_oid, 278 | schema_name, 279 | schema_description, 280 | table_oid, 281 | table_name, 282 | table_description, 283 | column_ordinal_position, 284 | column_name, 285 | column_base_type_name, 286 | column_modification_number, 287 | column_type_name, 288 | column_description, 289 | column_pk_ind, 290 | column_pk_name, 291 | column_fk_ind 292 | ) 293 | SELECT 294 | temp_source_objects.schema_oid, 295 | temp_source_objects.schema_name, 296 | temp_source_objects.schema_description, 297 | temp_source_objects.table_oid, 298 | temp_source_objects.table_name, 299 | temp_source_objects.table_description, 300 | temp_source_objects.column_ordinal_position, 301 | temp_source_objects.column_name, 302 | temp_source_objects.column_base_type_name, 303 | temp_source_objects.column_modification_number, 304 | temp_source_objects.column_type_name, 305 | temp_source_objects.column_description, 306 | temp_source_objects.column_pk_ind, 307 | temp_source_objects.column_pk_name, 308 | temp_source_objects.column_fk_ind 309 | FROM temp_source_objects 310 | LEFT JOIN auto_dw.source_objects ON source_objects.current_flag = 'Y' 311 | AND source_objects.schema_oid = temp_source_objects.schema_oid 312 | AND source_objects.table_oid = temp_source_objects.table_oid 313 | AND source_objects.column_ordinal_position = temp_source_objects.column_ordinal_position 314 | WHERE source_objects.column_ordinal_position IS NULL; 315 | 316 | DROP TABLE IF EXISTS temp_source_objects; 317 | "#, schema_pattern_include, table_pattern_include, column_pattern_include, schema_pattern_exclude, table_pattern_exclude, column_pattern_exclude) 318 | } 319 | 320 | #[no_mangle] 321 | pub fn insert_into_build_call( 322 | build_id: &str, build_flag: &str, build_status: &str, status: &str, accepted_transformer_confidence_level: &str 323 | ) -> String { 324 | format!(r#" 325 | INSERT INTO auto_dw.build_call (fk_transformer_responses, build_id, build_flag, build_status) 
326 | WITH 327 | confidence_level AS (SELECT {accepted_transformer_confidence_level} AS value), 328 | source_objects_transformation_cal AS ( 329 | SELECT 330 | MAX(pk_transformer_responses) AS max_pk_transformer_response 331 | FROM auto_dw.transformer_responses AS t 332 | GROUP BY fk_source_objects 333 | ), 334 | source_object_transformation_latest AS ( 335 | SELECT t.* FROM auto_dw.transformer_responses AS t 336 | JOIN source_objects_transformation_cal AS c ON t.pk_transformer_responses = c.max_pk_transformer_response 337 | ), 338 | source_object_status_prep AS ( 339 | SELECT 340 | t.pk_transformer_responses, 341 | s.schema_name, 342 | s.table_name, 343 | s.column_name, 344 | s.column_ordinal_position, 345 | t.confidence_score, 346 | t.reason, 347 | t.category, 348 | t.model_name, 349 | CASE 350 | WHEN dws.column_ordinal_position IS NOT NULL THEN true ELSE false 351 | END AS is_dw, 352 | MAX( 353 | CASE 354 | WHEN t.category = 'Business Key Part' AND t.confidence_score < cl.value THEN 1 355 | ELSE 0 356 | END 357 | ) OVER (PARTITION BY s.schema_name, s.table_name) AS bk_hold, 358 | SUM( 359 | CASE 360 | WHEN t.category = 'Business Key Part' THEN 1 361 | ELSE 0 362 | END 363 | ) OVER (PARTITION BY s.schema_name, s.table_name) AS bkp_cnt 364 | FROM auto_dw.source_objects AS s 365 | JOIN confidence_level AS cl ON true 366 | LEFT JOIN auto_dw.dw_source_objects AS dws ON s.table_oid = dws.table_oid AND s.column_ordinal_position = dws.column_ordinal_position 367 | LEFT JOIN source_object_transformation_latest AS t ON s.pk_source_objects = t.fk_source_objects 368 | WHERE s.current_flag = 'Y' AND s.deleted_flag = 'N' 369 | ), 370 | source_object AS ( 371 | SELECT *, 372 | CASE 373 | WHEN is_dw THEN 'Built' 374 | WHEN confidence_score IS NULL THEN 'Queued for Processing' 375 | -- Links 376 | WHEN category = 'Business Key Part' AND confidence_score >= cl.value AND bkp_cnt > 1 THEN 'Ready to Deploy' 377 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 0 AND bkp_cnt > 1 THEN 'Ready to Deploy' 378 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 1 AND bkp_cnt > 1 THEN 'Ready to Deploy - Awaiting Business Key (BK)' 379 | -- Hubs 380 | WHEN category = 'Business Key Part' AND confidence_score >= cl.value THEN 'Ready to Deploy' 381 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 0 THEN 'Ready to Deploy' 382 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 1 THEN 'Ready to Deploy - Awaiting Business Key (BK)' 383 | 384 | ELSE 'Requires Attention' 385 | END AS status, 386 | CASE 387 | WHEN confidence_score IS NOT NULL THEN CONCAT((confidence_score * 100)::INT::TEXT, '%') 388 | ELSE '-' 389 | END AS confidence_level, 390 | CASE 391 | WHEN confidence_score IS NOT NULL THEN 392 | ( 393 | 'Status: ' || 394 | CASE 395 | WHEN confidence_score IS NULL THEN 'Queued for Processing' 396 | WHEN confidence_score >= cl.value THEN 'Ready to Deploy' 397 | ELSE 'Requires Attention' 398 | END || ': ' || 399 | 'Model: ' || model_name || 400 | ' categorized this column as a ' || category || 401 | ' with a confidence of ' || CONCAT((confidence_score * 100)::INT::TEXT, '%') || '. 
' || 402 | 'Model Reasoning: ' || reason 403 | ) 404 | ELSE '-' 405 | END AS status_response 406 | FROM source_object_status_prep 407 | JOIN confidence_level AS cl ON true 408 | ) 409 | SELECT 410 | pk_transformer_responses AS fk_transformer_responses, 411 | '{build_id}' AS build_id, 412 | '{build_flag}' AS build_flag, 413 | '{build_status}' AS build_status 414 | FROM source_object 415 | WHERE status = '{status}'; 416 | "#) 417 | } 418 | 419 | #[no_mangle] 420 | pub fn build_object_pull(build_id: &str) -> String { 421 | format!(r#" 422 | WITH system AS ( 423 | SELECT system_identifier AS id FROM pg_control_system() LIMIT 1 424 | ) 425 | SELECT 426 | schema_name::TEXT AS schema_name, 427 | table_name::TEXT AS table_name, 428 | category::TEXT AS column_category, 429 | business_key_name::TEXT AS business_key_name, 430 | column_name::TEXT AS column_name, 431 | column_type_name::TEXT AS column_type_name, 432 | system.id::BIGINT AS system_id, 433 | so.table_oid::OID as table_oid, 434 | so.column_ordinal_position::SMALLINT AS column_ordinal_position 435 | FROM system, auto_dw.build_call AS bc 436 | LEFT JOIN auto_dw.transformer_responses AS t ON bc.fk_transformer_responses = t.pk_transformer_responses 437 | LEFT JOIN auto_dw.source_objects AS so ON t.fk_source_objects = so.pk_source_objects 438 | WHERE build_id = '{}'; 439 | "#, build_id) 440 | } 441 | 442 | #[no_mangle] 443 | pub fn source_column(accepted_transformer_confidence_level: &str) -> String { 444 | format!(r#" 445 | WITH 446 | confidence_level AS (SELECT {accepted_transformer_confidence_level} AS value), 447 | source_objects_transformation_cal AS ( 448 | SELECT 449 | MAX(pk_transformer_responses) AS max_pk_transformer_response 450 | FROM auto_dw.transformer_responses AS t 451 | GROUP BY fk_source_objects 452 | ), 453 | source_object_transformation_latest AS ( 454 | SELECT t.* FROM auto_dw.transformer_responses AS t 455 | JOIN source_objects_transformation_cal AS c ON t.pk_transformer_responses = c.max_pk_transformer_response 456 | ), 457 | source_object_status_prep AS ( 458 | SELECT 459 | t.pk_transformer_responses, 460 | s.schema_name, 461 | s.schema_oid, 462 | s.table_name, 463 | s.table_oid, 464 | s.column_name, 465 | s.column_ordinal_position, 466 | t.confidence_score, 467 | t.reason, 468 | t.category, 469 | t.model_name, 470 | CASE 471 | WHEN dws.column_ordinal_position IS NOT NULL THEN true ELSE false 472 | END AS is_dw, 473 | MAX( 474 | CASE 475 | WHEN t.category = 'Business Key Part' AND t.confidence_score < cl.value THEN 1 476 | ELSE 0 477 | END 478 | ) OVER (PARTITION BY s.schema_name, s.table_name) AS bk_hold, 479 | SUM( 480 | CASE 481 | WHEN t.category = 'Business Key Part' THEN 1 482 | ELSE 0 483 | END 484 | ) OVER (PARTITION BY s.schema_name, s.table_name) AS bkp_cnt 485 | FROM auto_dw.source_objects AS s 486 | JOIN confidence_level AS cl ON true 487 | LEFT JOIN auto_dw.dw_source_objects AS dws ON s.table_oid = dws.table_oid AND s.column_ordinal_position = dws.column_ordinal_position 488 | LEFT JOIN source_object_transformation_latest AS t ON s.pk_source_objects = t.fk_source_objects 489 | WHERE s.current_flag = 'Y' AND s.deleted_flag = 'N' 490 | ), 491 | source_object AS ( 492 | SELECT *, 493 | CASE 494 | WHEN is_dw THEN 'Built' 495 | WHEN confidence_score IS NULL THEN 'Queued for Processing' 496 | -- Links 497 | WHEN category = 'Business Key Part' AND confidence_score >= cl.value AND bkp_cnt > 1 THEN 'Ready' 498 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 0 AND bkp_cnt > 1 THEN 
'Ready' 499 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 1 AND bkp_cnt > 1 THEN 'Ready - Awaiting Business Key (BK)' 500 | -- Hubs 501 | WHEN category = 'Business Key Part' AND confidence_score >= cl.value THEN 'Ready' 502 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 0 THEN 'Ready' 503 | WHEN category <> 'Business Key Part' AND confidence_score >= cl.value AND bk_hold = 1 THEN 'Ready - Awaiting Business Key (BK)' 504 | 505 | ELSE 'Requires Attention' 506 | END AS status, 507 | CASE 508 | WHEN confidence_score IS NOT NULL THEN CONCAT((confidence_score * 100)::INT::TEXT, '%') 509 | ELSE '-' 510 | END AS confidence_level, 511 | CASE 512 | WHEN confidence_score IS NOT NULL THEN 513 | ( 514 | 'Status: ' || 515 | CASE 516 | WHEN confidence_score IS NULL THEN 'Queued for Processing' 517 | WHEN confidence_score >= cl.value THEN 'Ready' 518 | ELSE 'Requires Attention' 519 | END || ': ' || 520 | 'Model: ' || model_name || 521 | ' categorized this column as a ' || category || 522 | ' with a confidence of ' || CONCAT((confidence_score * 100)::INT::TEXT, '%') || '. ' || 523 | 'Model Reasoning: ' || reason 524 | ) 525 | ELSE '-' 526 | END AS status_response 527 | FROM source_object_status_prep 528 | JOIN confidence_level AS cl ON true 529 | ) 530 | SELECT 531 | schema_name::TEXT AS schema, 532 | schema_oid AS schema_oid, 533 | table_name::TEXT AS table, 534 | table_oid AS table_oid, 535 | column_name::TEXT AS column, 536 | column_ordinal_position AS column_ordinal_position, 537 | status, 538 | CASE 539 | WHEN category IS NULL THEN '-' 540 | WHEN category LIKE 'Descriptor - Sensitive' THEN 'Descriptor' 541 | ELSE category 542 | END::TEXT AS category, 543 | CASE 544 | WHEN category IS NULL THEN '-' 545 | WHEN category LIKE 'Descriptor - Sensitive' THEN 'True' 546 | ELSE 'False' 547 | END::TEXT AS is_sensitive, 548 | confidence_level, 549 | status_response 550 | FROM source_object 551 | ORDER BY schema_name, table_name, column_ordinal_position 552 | ; 553 | "#) 554 | } 555 | 556 | #[no_mangle] 557 | pub fn get_column_data(schema_name: &str, table_name: &str, column_name: &str) -> String { 558 | format!(r#" 559 | WITH 560 | system_qry AS ( 561 | SELECT system_identifier AS id FROM pg_control_system() LIMIT 1 562 | ), 563 | schema_qry AS ( 564 | SELECT 565 | pg_namespace.oid AS schema_oid, 566 | pg_namespace.nspname AS schema_name, 567 | pg_description.description AS schema_description 568 | FROM pg_catalog.pg_namespace 569 | LEFT JOIN pg_catalog.pg_description ON pg_namespace.oid = pg_description.objoid AND 570 | pg_description.objsubid = 0 -- No Sub Objects 571 | WHERE pg_namespace.nspname !~ 'pg_.*' AND pg_namespace.nspname NOT IN ('information_schema', 'auto_dw') 572 | ), 573 | table_qry AS ( 574 | SELECT 575 | pg_class.oid AS table_oid, 576 | pg_class.relname AS table_name, 577 | pg_class.relnamespace AS table_schema_oid, 578 | pg_description.description AS table_description 579 | FROM pg_catalog.pg_class 580 | LEFT JOIN pg_catalog.pg_description ON pg_class.oid = pg_description.objoid AND 581 | pg_description.objsubid = 0 -- No Sub Objects 582 | WHERE 583 | pg_class.relkind IN ('r', 'f') -- 'r' stands for ordinary table, 'f' stands for foreign table 584 | ), 585 | column_qry AS ( 586 | SELECT 587 | pg_attribute.attrelid AS column_table_oid, 588 | pg_attribute.attname AS column_name, 589 | pg_attribute.attnum AS column_ordinal_position, 590 | pg_attribute.atttypid AS column_type_oid, 591 | pg_attribute.atttypmod AS 
column_modification_number, 592 | pg_catalog.format_type(atttypid, atttypmod) AS column_type_name, 593 | pg_description.description AS column_description 594 | FROM pg_attribute 595 | LEFT JOIN pg_catalog.pg_description ON pg_attribute.attrelid = pg_description.objoid AND 596 | pg_attribute.attnum = pg_description.objsubid 597 | WHERE 598 | pg_attribute.attnum > 0 -- Only real columns, not system columns 599 | AND NOT pg_attribute.attisdropped -- Only columns that are not dropped 600 | ), 601 | type_qry AS ( 602 | SELECT 603 | oid AS type_oid, 604 | typname AS base_type_name 605 | FROM pg_type 606 | ), 607 | pk_table_column_qry AS ( 608 | SELECT 609 | conrelid AS table_oid, 610 | unnest(conkey) AS column_ordinal_position, 611 | 1 AS column_pk_ind, 612 | conname AS column_pk_name 613 | FROM 614 | pg_constraint 615 | WHERE 616 | contype = 'p' 617 | ), 618 | fk_table_column_qry AS ( 619 | SELECT DISTINCT -- DISTINCT: one column could have multiple FKs. 620 | conrelid AS table_oid, 621 | unnest(conkey) AS column_ordinal_position, 622 | 1 AS column_fk_ind 623 | FROM 624 | pg_constraint 625 | WHERE 626 | contype = 'f' 627 | ), 628 | source_objects_prep AS ( 629 | SELECT 630 | schema_qry.schema_oid, 631 | schema_qry.schema_name, 632 | schema_qry.schema_description, 633 | table_qry.table_oid, 634 | table_qry.table_name, 635 | COALESCE(table_qry.table_description, 'NA') AS table_description, 636 | column_qry.column_ordinal_position, 637 | column_qry.column_name, 638 | type_qry.base_type_name AS column_base_type_name, 639 | column_qry.column_modification_number, 640 | column_qry.column_type_name, 641 | COALESCE(column_qry.column_description, 'NA') AS column_description, 642 | COALESCE(pk_table_column_qry.column_pk_ind, 0) AS column_pk_ind, 643 | COALESCE(pk_table_column_qry.column_pk_name, 'NA') AS column_pk_name, 644 | COALESCE(fk_table_column_qry.column_fk_ind, 0) AS column_fk_ind 645 | FROM schema_qry 646 | LEFT JOIN table_qry ON schema_qry.schema_oid = table_qry.table_schema_oid 647 | LEFT JOIN column_qry ON table_qry.table_oid = column_qry.column_table_oid 648 | LEFT JOIN type_qry ON column_qry.column_type_oid = type_qry.type_oid 649 | LEFT JOIN pk_table_column_qry ON 650 | table_qry.table_oid = pk_table_column_qry.table_oid AND 651 | column_qry.column_ordinal_position = pk_table_column_qry.column_ordinal_position 652 | LEFT JOIN fk_table_column_qry ON 653 | table_qry.table_oid = fk_table_column_qry.table_oid AND 654 | column_qry.column_ordinal_position = fk_table_column_qry.column_ordinal_position 655 | ) 656 | SELECT 657 | system_qry.id::BIGINT AS system_id, 658 | source_objects_prep.schema_oid::OID as schema_oid, 659 | source_objects_prep.schema_name::TEXT AS schema_name, 660 | source_objects_prep.table_name::TEXT AS table_name, 661 | source_objects_prep.table_oid::OID as table_oid, 662 | source_objects_prep.column_name::TEXT AS column_name, 663 | source_objects_prep.column_ordinal_position::SMALLINT AS column_ordinal_position, 664 | source_objects_prep.column_type_name::TEXT AS column_type_name 665 | FROM source_objects_prep, system_qry 666 | WHERE 667 | schema_name = '{}' AND 668 | table_name = '{}' AND 669 | column_name = '{}' 670 | ; 671 | "#, schema_name, table_name, column_name) 672 | } -------------------------------------------------------------------------------- /extension/src/model/source_objects.rs: -------------------------------------------------------------------------------- 1 | use pgrx::{Json as JsonValue, pg_sys::Oid}; 2 | use serde::{Deserialize, Deserializer, Serialize}; 
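// Note: the serde `rename` attributes throughout this module mirror the human-readable JSON keys (e.g. "Column Links", "Column Details") emitted by SOURCE_OBJECTS_JSON in queries.rs.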
3 | 4 | #[derive(Debug)] 5 | pub struct SourceTablePrompt { 6 | #[allow(dead_code)] 7 | pub key: Oid, 8 | pub table_column_links: JsonValue, // For linking columns to foreign keys 9 | pub table_details: JsonValue, 10 | } 11 | 12 | #[derive(Debug, Serialize, Deserialize, Clone)] 13 | pub struct SourceTableDetail { 14 | #[serde(rename = "Schema Name")] 15 | pub schema_name: String, 16 | 17 | #[serde(rename = "Table Name")] 18 | pub table_name: String, 19 | 20 | #[serde(rename = "Column Details")] 21 | pub column_details: Vec<String>, 22 | } 23 | 24 | #[derive(Debug, Serialize, Deserialize)] 25 | pub struct Response { 26 | #[serde(rename = "Table ID")] 27 | pub table_id: u32, 28 | #[serde(rename = "Generation")] 29 | pub generation: GenerationTableDetail, 30 | } 31 | 32 | #[derive(Debug, Serialize, Deserialize, Clone)] 33 | pub struct GenerationColumnDetail { 34 | #[serde(rename = "Category")] 35 | pub category: String, 36 | #[serde(rename = "Business Key Name", deserialize_with = "replace_spaces_with_underscores")] 37 | pub business_key_name: String, 38 | #[serde(rename = "Column No")] 39 | pub column_no: i32, 40 | #[serde(rename = "Confidence")] 41 | pub confidence: f64, 42 | #[serde(rename = "Reason")] 43 | pub reason: String, 44 | } 45 | 46 | #[derive(Debug, Serialize, Deserialize, Clone)] 47 | pub struct GenerationTableDetail { 48 | #[serde(rename = "Schema Name")] 49 | pub schema_name: String, 50 | #[serde(rename = "Table Name")] 51 | pub table_name: String, 52 | #[serde(rename = "Column Details")] 53 | pub response_column_details: Vec<GenerationColumnDetail>, 54 | } 55 | 56 | #[derive(Debug, Serialize, Deserialize)] 57 | pub struct ColumnLink { 58 | #[serde(rename = "Column Ordinal Position")] 59 | pub column_ordinal_position: i32, 60 | #[serde(rename = "PK Source Objects")] 61 | pub pk_source_objects: i32, 62 | } 63 | 64 | #[derive(Debug, Serialize, Deserialize)] 65 | pub struct TableLinks { 66 | #[serde(rename = "Column Links")] 67 | pub column_links: Vec<ColumnLink>, 68 | } 69 | 70 | impl TableLinks { 71 | // Method to find the pk_source_objects based on column_ordinal_position 72 | pub fn find_pk_source_objects(&self, search_position: i32) -> Option<i32> { 73 | for link in &self.column_links { 74 | if link.column_ordinal_position == search_position { 75 | return Some(link.pk_source_objects); 76 | } 77 | } 78 | None 79 | } 80 | } 81 | 82 | fn replace_spaces_with_underscores<'de, D>(deserializer: D) -> Result<String, D::Error> 83 | where 84 | D: Deserializer<'de>, 85 | { 86 | let s = String::deserialize(deserializer)?; 87 | Ok(s.replace(' ', "_")) 88 | } -------------------------------------------------------------------------------- /extension/src/utility/guc.rs: -------------------------------------------------------------------------------- 1 | use pgrx::guc::*; 2 | use std::ffi::CStr; 3 | 4 | // Default not set due to security boundaries associated with extension install. 5 | // The background process has no way to determine which database the extension is installed in. 6 | // When the extension is being created, the database name can only be saved at the session level into the GUC. 
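// For example, the installing session can run: SET pg_auto_dw.database_name = 'postgres'; (the database name shown is illustrative).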
7 | pub static PG_AUTO_DW_DATABASE_NAME: GucSetting<Option<&'static CStr>> = GucSetting::<Option<&'static CStr>>::new(None); 8 | 9 | // Default not set, as this will make direct changes to the database 10 | pub static PG_AUTO_DW_DW_SCHEMA: GucSetting<Option<&'static CStr>> = GucSetting::<Option<&'static CStr>>::new(None); 11 | 12 | // Default set to Ollama 13 | pub static PG_AUTO_DW_TRANSFORMER_SERVER_TYPE: GucSetting<Option<&'static CStr>> = GucSetting::<Option<&'static CStr>>::new(Some(unsafe { 14 | CStr::from_bytes_with_nul_unchecked(b"ollama\0") 15 | })); 16 | 17 | // Default Transformer Server URL 18 | pub static PG_AUTO_DW_TRANSFORMER_SERVER_URL: GucSetting<Option<&'static CStr>> = GucSetting::<Option<&'static CStr>>::new(Some(unsafe { 19 | CStr::from_bytes_with_nul_unchecked(b"http://localhost:11434/api/generate\0") 20 | })); 21 | 22 | // Default not set 23 | pub static PG_AUTO_DW_TRANSFORMER_SERVER_TOKEN: GucSetting<Option<&'static CStr>> = GucSetting::<Option<&'static CStr>>::new(None); 24 | 25 | // Default model is "mistral" 26 | pub static PG_AUTO_DW_MODEL: GucSetting<Option<&'static CStr>> = GucSetting::<Option<&'static CStr>>::new(Some(unsafe { 27 | CStr::from_bytes_with_nul_unchecked(b"mistral\0") 28 | })); 29 | 30 | // The accepted threshold for the transformer's self-described confidence level - default 0.8. 31 | pub static PG_AUTO_DW_ACCEPTED_TRANSFORMER_CONFIDENCE_LEVEL: GucSetting<f64> = GucSetting::<f64>::new(0.8); 32 | 33 | // Number of times the transformer can be given the same request if a failure is recognized - default 3. 34 | pub static PG_AUTO_DW_TRANSFORMER_SERVER_MAX_RETRIES: GucSetting<i32> = GucSetting::<i32>::new(3); 35 | 36 | // Number of seconds to wait for the transformer's response - default 60 sec. 37 | pub static PG_AUTO_DW_TRANSFORMER_SERVER_WAIT_DURATION: GucSetting<i32> = GucSetting::<i32>::new(60); 38 | 39 | pub fn init_guc() { 40 | // Register the GUCs 41 | GucRegistry::define_string_guc( 42 | "pg_auto_dw.database_name", 43 | "Database name for the pg_auto_dw extension.", 44 | "Specifies the name of the database where the pg_auto_dw extension will be utilized.", 45 | &PG_AUTO_DW_DATABASE_NAME, 46 | GucContext::Suset, 47 | GucFlags::default(), 48 | ); 49 | 50 | GucRegistry::define_string_guc( 51 | "pg_auto_dw.dw_schema", 52 | "Data warehouse schema for the pg_auto_dw extension.", 53 | "Specifies the name of the schema within the database where the pg_auto_dw extension will automatically create and store data warehouse components.", 54 | &PG_AUTO_DW_DW_SCHEMA, 55 | GucContext::Suset, 56 | GucFlags::default(), 57 | ); 58 | 59 | GucRegistry::define_string_guc( 60 | "pg_auto_dw.transformer_server_type", 61 | "Transformer server type for the pg_auto_dw extension.", 62 | "Specifies the server type used by the pg_auto_dw extension. 
Currently available server types include ollama and openai.", 63 | &PG_AUTO_DW_TRANSFORMER_SERVER_TYPE, 64 | GucContext::Suset, 65 | GucFlags::default(), 66 | ); 67 | 68 | GucRegistry::define_string_guc( 69 | "pg_auto_dw.transformer_server_url", 70 | "Transformer URL for the pg_auto_dw extension.", 71 | "Specifies the URL for the transformer service used by the pg_auto_dw extension.", 72 | &PG_AUTO_DW_TRANSFORMER_SERVER_URL, 73 | GucContext::Suset, 74 | GucFlags::default(), 75 | ); 76 | 77 | GucRegistry::define_string_guc( 78 | "pg_auto_dw.transformer_server_token", 79 | "Bearer token for authenticating API calls to the Transformer Server for the pg_auto_dw extension.", 80 | "The Bearer token is required for authenticating API calls to the Transformer Server when interacting with the pg_auto_dw extension.", 81 | &PG_AUTO_DW_TRANSFORMER_SERVER_TOKEN, 82 | GucContext::Suset, 83 | GucFlags::default(), 84 | ); 85 | 86 | GucRegistry::define_string_guc( 87 | "pg_auto_dw.model", 88 | "Transformer model for the pg_auto_dw extension.", 89 | "Specifies the transformer model to be used by the pg_auto_dw extension for data processing or analysis.", 90 | &PG_AUTO_DW_MODEL, 91 | GucContext::Suset, 92 | GucFlags::default(), 93 | ); 94 | 95 | GucRegistry::define_float_guc( 96 | "pg_auto_dw.accepted_transformer_confidence_level", 97 | "Transformer generated confidence level for the pg_auto_dw extension.", 98 | "Specifies the confidence level threshold generated by the transformer model for the operations performed by the pg_auto_dw extension.", 99 | &PG_AUTO_DW_ACCEPTED_TRANSFORMER_CONFIDENCE_LEVEL, 100 | 0.0, // min value 101 | 1.0, // max value 102 | GucContext::Suset, 103 | GucFlags::default(), 104 | ); 105 | 106 | GucRegistry::define_int_guc( 107 | "pg_auto_dw.transformer_server_max_retries", 108 | "Maximum Transformer Retries", 109 | "Specifies the number of retry attempts the pg_auto_dw extension can make for a transformer request in case of failure.", 110 | &PG_AUTO_DW_TRANSFORMER_SERVER_MAX_RETRIES, 111 | 1, // min value 112 | 10, // max value 113 | GucContext::Suset, 114 | GucFlags::default(), 115 | ); 116 | 117 | GucRegistry::define_int_guc( 118 | "pg_auto_dw.transformer_server_wait_duration", 119 | "Maximum Transformer Server Wait Time", 120 | "Specifies the maximum number of seconds the pg_auto_dw extension will wait for a response from the transformer server.", 121 | &PG_AUTO_DW_TRANSFORMER_SERVER_WAIT_DURATION, 122 | 1, // min value 123 | 360, // max value 124 | GucContext::Suset, 125 | GucFlags::default(), 126 | ); 127 | 128 | } 129 | 130 | // For handling of GUCs that can be error prone 131 | #[derive(Clone, Debug)] 132 | pub enum PgAutoDWGuc { 133 | DatabaseName, 134 | DwSchema, 135 | TransformerServerType, 136 | TransformerServerUrl, 137 | TransformerServerToken, 138 | TransformerServerWaitDuration, 139 | TransformerServerMaxRetries, 140 | Model, 141 | AcceptedTransformerConfidenceLevel, 142 | } 143 | 144 | // A convenience function to get this project's GUCs 145 | pub fn get_guc(guc: PgAutoDWGuc) -> Option<String> { 146 | match guc { 147 | PgAutoDWGuc::DatabaseName => cstr_option_to_string(PG_AUTO_DW_DATABASE_NAME.get()), 148 | PgAutoDWGuc::DwSchema => cstr_option_to_string(PG_AUTO_DW_DW_SCHEMA.get()), 149 | PgAutoDWGuc::TransformerServerType => cstr_option_to_string(PG_AUTO_DW_TRANSFORMER_SERVER_TYPE.get()), 150 | PgAutoDWGuc::TransformerServerUrl => cstr_option_to_string(PG_AUTO_DW_TRANSFORMER_SERVER_URL.get()), 151 | PgAutoDWGuc::TransformerServerToken => 
cstr_option_to_string(PG_AUTO_DW_TRANSFORMER_SERVER_TOKEN.get()), 152 | PgAutoDWGuc::TransformerServerWaitDuration => cstr_from_int(PG_AUTO_DW_TRANSFORMER_SERVER_WAIT_DURATION.get()), 153 | PgAutoDWGuc::TransformerServerMaxRetries => cstr_from_int(PG_AUTO_DW_TRANSFORMER_SERVER_MAX_RETRIES.get()), 154 | PgAutoDWGuc::Model => cstr_option_to_string(PG_AUTO_DW_MODEL.get()), 155 | PgAutoDWGuc::AcceptedTransformerConfidenceLevel => cstr_from_float(PG_AUTO_DW_ACCEPTED_TRANSFORMER_CONFIDENCE_LEVEL.get()), 156 | } 157 | } 158 | 159 | fn cstr_option_to_string(cstr_o: Option<&CStr>) -> Option<String> { 160 | cstr_o 161 | .and_then(|cstr| cstr.to_str().ok().map(|s| s.to_owned())) 162 | } 163 | 164 | fn cstr_from_float(val: f64) -> Option<String> { 165 | Some(val.to_string()) 166 | } 167 | 168 | fn cstr_from_int(val: i32) -> Option<String> { 169 | Some(val.to_string()) 170 | } 171 | 172 | -------------------------------------------------------------------------------- /extension/src/utility/mod.rs: -------------------------------------------------------------------------------- 1 | pub mod transformer_client; 2 | mod ollama_client; 3 | mod openai_client; 4 | pub mod setup; 5 | pub mod guc; -------------------------------------------------------------------------------- /extension/src/utility/ollama_client.rs: -------------------------------------------------------------------------------- 1 | use reqwest::ClientBuilder; 2 | use serde::{Deserialize, Serialize}; 3 | use std::time::Duration; 4 | 5 | use crate::utility::guc; 6 | use crate::model::prompt_template::PromptTemplate; 7 | 8 | #[derive(Serialize, Debug)] 9 | pub struct GenerateRequest { 10 | pub model: String, 11 | pub prompt: String, 12 | pub format: String, 13 | pub stream: bool, 14 | pub options: Options, 15 | } 16 | 17 | #[derive(Serialize, Debug)] 18 | pub struct Options { 19 | pub temperature: f64, 20 | } 21 | 22 | #[derive(Deserialize, Debug)] 23 | #[allow(dead_code)] 24 | pub struct GenerateResponse { 25 | pub model: String, 26 | pub created_at: String, 27 | pub response: String, 28 | pub done: bool, 29 | } 30 | 31 | pub async fn send_request(new_json: &str, template_type: PromptTemplate, col: &u32, hints: &str, timeout_in_sec: u64) -> Result<serde_json::Value, Box<dyn std::error::Error>> { 32 | 33 | let client = ClientBuilder::new().timeout(Duration::from_secs(timeout_in_sec)).build()?; // The default 30 sec is too short for some LLMs. 34 | 35 | let prompt_template = template_type.template(); 36 | 37 | // Inject new_json into the prompt_template 38 | let column_number = col.to_string(); 39 | let prompt = prompt_template 40 | .replace("{new_json}", new_json) 41 | .replace("{column_no}", &column_number) 42 | .replace("{hints}", &hints); 43 | 44 | // GUC Values for the transformer server 45 | let transformer_server_url = guc::get_guc(guc::PgAutoDWGuc::TransformerServerUrl).ok_or("GUC: Transformer Server URL is not set")?; 46 | let model = guc::get_guc(guc::PgAutoDWGuc::Model).ok_or("MODEL GUC is not set.")?; 47 | 48 | let temperature: f64 = 0.75; 49 | 50 | let options: Options = Options { 51 | temperature, 52 | }; 53 | 54 | let request = GenerateRequest { 55 | model, 56 | prompt, 57 | format: "json".to_string(), 58 | stream: false, 59 | options, 60 | }; 61 | 62 | let response = client 63 | .post(&transformer_server_url) 64 | .json(&request) 65 | .send() 66 | .await? 
67 | .json::<GenerateResponse>() 68 | .await?; 69 | 70 | // Deserialize 71 | let response_json: serde_json::Value = serde_json::from_str(&response.response)?; 72 | 73 | Ok(response_json) 74 | } 75 | -------------------------------------------------------------------------------- /extension/src/utility/openai_client.rs: -------------------------------------------------------------------------------- 1 | use pgrx::prelude::*; 2 | 3 | use reqwest::ClientBuilder; 4 | use serde::{Deserialize, Serialize}; 5 | use std::time::Duration; 6 | 7 | use crate::utility::guc; 8 | use crate::model::prompt_template::PromptTemplate; 9 | 10 | #[derive(Serialize, Debug)] 11 | pub struct Request { 12 | pub model: String, // Model name for OpenAI 13 | pub messages: Vec<Message>, // List of messages for chat format 14 | pub temperature: f64, // Temperature setting 15 | pub response_format: ResponseFormat, // JSON-only response format field 16 | } 17 | 18 | #[derive(Serialize, Deserialize, Debug)] 19 | pub struct Message { 20 | pub role: String, // "user", "assistant", or "system" 21 | pub content: String, // The actual prompt or message content 22 | } 23 | 24 | #[derive(Serialize, Debug)] 25 | pub struct ResponseFormat { 26 | #[serde(rename = "type")] 27 | pub r#type: String, // To ensure JSON response format 28 | } 29 | 30 | #[derive(Serialize, Deserialize, Debug)] 31 | pub struct Response { 32 | pub id: String, // Unique identifier for the chat session 33 | pub object: String, // Object type, usually "chat.completion" 34 | pub created: u64, // Timestamp when the response was created 35 | pub model: String, // Model name used for the response 36 | pub choices: Vec<Choice>, // List of choices (contains the actual answer) 37 | pub usage: Usage, // Information about token usage 38 | } 39 | 40 | #[derive(Serialize, Deserialize, Debug)] 41 | pub struct Choice { 42 | pub message: Message, // Contains the assistant's message 43 | pub finish_reason: Option<String>, // Reason for stopping (e.g., "stop") 44 | pub index: usize, // Index of the choice 45 | pub logprobs: Option<serde_json::Value>, // Log probabilities (if applicable) 46 | } 47 | 48 | #[derive(Serialize, Deserialize, Debug)] 49 | pub struct Usage { 50 | pub prompt_tokens: u32, // Number of tokens in the prompt 51 | pub completion_tokens: u32, // Number of tokens in the completion 52 | pub total_tokens: u32, // Total number of tokens used 53 | } 54 | 55 | pub async fn send_request(new_json: &str, template_type: PromptTemplate, col: &u32, hints: &str, timeout_in_sec: u64) -> Result<serde_json::Value, Box<dyn std::error::Error>> { 56 | 57 | let client = ClientBuilder::new().timeout(Duration::from_secs(timeout_in_sec)).build()?; // The default 30 sec is too short for some LLMs. 
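// The timeout is supplied by the caller and is presumably sourced from the pg_auto_dw.transformer_server_wait_duration GUC (default 60 sec).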
58 |
59 |     let prompt_template = template_type.template();
60 |     // let prompt_template = PromptTemplate::Test.template();
61 |
62 |     // Inject new_json, the column number, and any hints into the prompt template.
63 |     let column_number = col.to_string();
64 |     let prompt = prompt_template
65 |         .replace("{new_json}", new_json)
66 |         .replace("{column_no}", &column_number)
67 |         .replace("{hints}", hints);
68 |
69 |     // GUC values for the transformer server.
70 |     let transformer_server_url = guc::get_guc(guc::PgAutoDWGuc::TransformerServerUrl).ok_or("GUC: Transformer Server URL is not set.")?;
71 |     let transformer_server_token = guc::get_guc(guc::PgAutoDWGuc::TransformerServerToken).ok_or("GUC: Transformer Server Token is not set.")?;
72 |
73 |     let model = guc::get_guc(guc::PgAutoDWGuc::Model).ok_or("GUC: Model is not set.")?;
74 |
75 |     let json_type = String::from("json_object");
76 |     let response_format = ResponseFormat { r#type: json_type };
77 |
78 |     let temperature: f64 = 0.75;
79 |
80 |     let role = String::from("user");
81 |
82 |     let message = Message {
83 |         role,
84 |         content: prompt,
85 |     };
86 |
87 |     let messages = vec![message];
88 |
89 |     let request = Request {
90 |         model,
91 |         messages,
92 |         temperature,
93 |         response_format,
94 |     };
95 |
96 |     log!("Request: {:#?}", request.messages[0]);
97 |
98 |     let raw_response = client
99 |         .post(&transformer_server_url)
100 |         .header("Authorization", format!("Bearer {}", transformer_server_token))
101 |         .header("Content-Type", "application/json")
102 |         .json(&request)
103 |         .send()
104 |         .await?;
105 |
106 |     // Capture the body as a string for logging before deserializing.
107 |     let raw_body = raw_response.text().await?;
108 |     log!("Raw response body: {}", raw_body);
109 |
110 |     // Attempt to deserialize the response from the raw body.
111 |     let response: Response = match serde_json::from_str(&raw_body) {
112 |         Ok(parsed_response) => parsed_response,
113 |         Err(e) => {
114 |             log!("Error parsing response as type `Response`: {}", e);
115 |             return Err(Box::new(e));
116 |         }
117 |     };
118 |
119 |     // Extract the content string from the first choice.
120 |     let content_str = &response
121 |         .choices
122 |         .first()
123 |         .ok_or("No choices in response")?
124 |         .message
125 |         .content;
126 |
127 |     log!("Response: {:#?}", content_str);
128 |
129 |     // Parse the content string into a serde_json::Value.
130 |     let content_json: serde_json::Value = serde_json::from_str(content_str)?;
131 |
132 |     Ok(content_json)
133 | }
134 |
135 |
136 |
137 |
138 |
--------------------------------------------------------------------------------
/extension/src/utility/setup.rs:
--------------------------------------------------------------------------------
1 | use pgrx::prelude::*;
2 |
3 | extension_sql_file!("sql/info_tables.sql");
--------------------------------------------------------------------------------
/extension/src/utility/sql/info_tables.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE IF EXISTS source_objects;
2 |
3 | CREATE TABLE IF NOT EXISTS source_objects
4 | (
5 |     pk_source_objects bigserial PRIMARY KEY,
6 |     schema_oid oid,
7 |     schema_name name,
8 |     schema_description text,
9 |     table_oid oid,
10 |     table_name name,
11 |     table_description text,
12 |     column_ordinal_position smallint,
13 |     column_name name,
14 |     column_base_type_name name,
15 |     column_modification_number integer,
16 |     column_type_name text,
17 |     column_description text,
18 |     column_pk_ind INT DEFAULT 0,
19 |     column_pk_name name,
20 |     column_fk_ind INT DEFAULT 0,
21 |     column_dw_flag CHAR(1) DEFAULT 'N',
22 |     valid_from timestamp without time zone DEFAULT (now() AT TIME ZONE 'UTC'), -- Default to current UTC timestamp
23 |     valid_to timestamp without time zone, -- End of validity period
24 |     current_flag CHAR(1) DEFAULT 'Y', -- Indicator of current record
25 |     deleted_flag CHAR(1) DEFAULT 'N'
26 | );
27 |
28 | DROP TABLE IF EXISTS dw_source_objects;
29 |
30 | CREATE TABLE IF NOT EXISTS dw_source_objects
31 | (
32 |     pk_dw_source_objects BIGSERIAL PRIMARY KEY,
33 |     table_oid OID,
34 |     column_ordinal_position SMALLINT
35 | );
36 |
37 | DROP TABLE IF EXISTS transformer_responses;
38 |
39 | CREATE TABLE IF NOT EXISTS transformer_responses
40 | (
41 |     pk_transformer_responses BIGSERIAL PRIMARY KEY,
42 |     fk_source_objects BIGINT,
43 |     model_name TEXT,
44 |     category TEXT,
45 |     business_key_name TEXT,
46 |     confidence_score NUMERIC(3, 2),
47 |     reason TEXT,
48 |     created_at TIMESTAMP WITHOUT TIME ZONE DEFAULT (now() AT TIME ZONE 'UTC'),
49 |     CONSTRAINT fk_source_objects FOREIGN KEY (fk_source_objects)
50 |         REFERENCES source_objects(pk_source_objects)
51 |         ON DELETE CASCADE
52 | );
53 |
54 | DROP TABLE IF EXISTS build_call;
55 |
56 | CREATE TABLE IF NOT EXISTS build_call
57 | (
58 |     pk_build_call BIGSERIAL PRIMARY KEY,
59 |     fk_transformer_responses BIGINT,
60 |     build_id VARCHAR(100),
61 |     build_flag VARCHAR(100),
62 |     build_status VARCHAR(100),
63 |     created_at TIMESTAMP WITHOUT TIME ZONE DEFAULT (now() AT TIME ZONE 'UTC'),
64 |     CONSTRAINT fk_transformer_responses FOREIGN KEY (fk_transformer_responses)
65 |         REFERENCES transformer_responses(pk_transformer_responses)
66 |         ON DELETE CASCADE
67 | );
68 |
69 | DROP TABLE IF EXISTS dv_repo;
70 |
71 | CREATE TABLE dv_repo (
72 |     build_id TEXT,
73 |     insert_time TIMESTAMP WITHOUT TIME ZONE DEFAULT (now() AT TIME ZONE 'UTC'),
74 |     schema JSON
75 | );
76 |
77 | DROP TABLE IF EXISTS log;
78 |
79 | CREATE TABLE log (
80 |     pk_log BIGSERIAL PRIMARY KEY,
81 |     log_ts TIMESTAMP WITHOUT TIME ZONE DEFAULT (NOW() AT TIME ZONE 'UTC'),
82 |     process VARCHAR(50),
83 |     level VARCHAR(50),
84 |     message TEXT
85 | );
86 |
--------------------------------------------------------------------------------
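Taken together, these info tables trace the pipeline: `source_objects` catalogs the source columns, `transformer_responses` stores what the LLM inferred about each one, and `build_call` records which inferences were acted on. The query below is a minimal sketch of how they can be read together; it assumes the extension's schema (e.g., `auto_dw`) is on your `search_path`, and that `confidence_score` is the value weighed against the accepted-confidence GUC.

```SQL
-- Sketch: review the latest LLM classification per current source column.
SELECT
    so.schema_name,
    so.table_name,
    so.column_name,
    tr.category,            -- how the transformer classified the column
    tr.business_key_name,
    tr.confidence_score,    -- weighed against the accepted-confidence GUC
    tr.reason
FROM source_objects AS so
JOIN transformer_responses AS tr
    ON tr.fk_source_objects = so.pk_source_objects
WHERE so.current_flag = 'Y'
ORDER BY tr.created_at DESC;
```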
/extension/src/utility/transformer_client.rs:
--------------------------------------------------------------------------------
1 | use crate::model::prompt_template::PromptTemplate;
2 | use super::{guc, openai_client, ollama_client};
3 | use TransformerServerType::{OpenAI, Ollama};
4 | use std::str::FromStr;
5 |
6 | pub enum TransformerServerType {
7 |     OpenAI,
8 |     Ollama,
9 | }
10 |
11 | impl FromStr for TransformerServerType {
12 |     type Err = &'static str;
13 |
14 |     fn from_str(s: &str) -> Result<Self, Self::Err> {
15 |         match s.to_lowercase().as_str() {
16 |             "openai" => Ok(OpenAI),
17 |             "ollama" => Ok(Ollama),
18 |             _ => Err("Invalid Transformer Server Type"),
19 |         }
20 |     }
21 | }
22 |
23 | pub async fn send_request(new_json: &str, template_type: PromptTemplate, col: &u32, hints: &str) -> Result<serde_json::Value, Box<dyn std::error::Error>> {
24 |
25 |     let transformer_server_type_str = guc::get_guc(guc::PgAutoDWGuc::TransformerServerType).ok_or("GUC: Transformer Server Type is not set.")?;
26 |
27 |     let transformer_server_wait_duration = guc::get_guc(guc::PgAutoDWGuc::TransformerServerWaitDuration).ok_or("GUC: Transformer Server Wait Duration is not set.")?;
28 |     let timeout_in_sec: u64 = transformer_server_wait_duration.parse().expect("Transformer Server Wait Duration is not a valid u64");
29 |
30 |     let transformer_server_type = transformer_server_type_str.parse::<TransformerServerType>()
31 |         .map_err(|e| format!("Error parsing Transformer Server Type: {}", e))?;
32 |
33 |     match transformer_server_type {
34 |         OpenAI => openai_client::send_request(new_json, template_type, col, hints, timeout_in_sec).await,
35 |         Ollama => ollama_client::send_request(new_json, template_type, col, hints, timeout_in_sec).await,
36 |     }
37 | }
38 |
--------------------------------------------------------------------------------
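`transformer_client` resolves the server type, wait duration, and (through the per-client modules) the URL, token, and model entirely from GUCs at call time, so switching between OpenAI and Ollama is a configuration change rather than a code change. The sketch below shows how such a configuration might look; the GUC names are an assumption (here following a `pg_auto_dw.` prefix) and should be confirmed against `guc.rs`, and the URL and model name are only examples.

```SQL
-- Sketch, assuming the GUCs are registered under a "pg_auto_dw." prefix (confirm in guc.rs):
ALTER SYSTEM SET pg_auto_dw.transformer_server_type = 'ollama';   -- parsed case-insensitively
ALTER SYSTEM SET pg_auto_dw.transformer_server_url = 'http://localhost:11434/api/generate';
ALTER SYSTEM SET pg_auto_dw.model = 'mistral';                    -- example model name
ALTER SYSTEM SET pg_auto_dw.transformer_server_wait_duration = '300'; -- seconds, parsed as u64
SELECT pg_reload_conf();
```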