├── .gitignore ├── LICENSE ├── README.md ├── cloudbuild.json ├── dataform.json ├── definitions ├── example.js └── source_data.sqlx ├── index.js ├── package-lock.json └── package.json /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | .df-credentials.json 3 | node_modules/ 4 | .idea/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Dataform 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Common data models for creating [type-2 slowly changing dimensions tables](https://en.wikipedia.org/wiki/Slowly_changing_dimension) from mutable data sources in [Dataform](https://github.com/dataform-co/dataform). 2 | 3 | ## Supported warehouses 4 | 5 | - BigQuery 6 | - Redshift/PG 7 | - Snowflake 8 | 9 | _If you would like us to add support for another warehouse, please get in touch via [email](mailto:team@dataform.co) or [Slack](https://slack.dataform.co/)_ 10 | 11 | ## Installation 12 | 13 | Add the package to your `package.json` file in your Dataform project. You can find the most up to package version on the [releases page](https://github.com/dataform-co/dataform-scd/releases). 14 | 15 | ## Configure the package 16 | 17 | Create a new JS file in your `definitions/` folder create an SCD table with the following example: 18 | 19 | ```js 20 | const scd = require("dataform-scd"); 21 | 22 | scd("source_data_scd", { 23 | // A unique identifier for rows in the table. 24 | uniqueKey: "user_id", 25 | // A field that stores a timestamp or date of when the row was last changed. 26 | timestamp: "updated_at", 27 | // A field that stores the hash value of the fields that we want to track changes in. If you do not want to use the hash comparison, you may omit this field or set it to null 28 | hash: "hash_value", // OPTIONAL 29 | // The source table to build slowly changing dimensions from. 30 | source: { 31 | schema: "dataform_scd_example", 32 | name: "source_data", 33 | }, 34 | // Any configuration parameters to apply to the incremental table that will be created. 35 | incrementalConfig: { 36 | bigquery: { 37 | partitionBy: "updated_at", 38 | }, 39 | }, 40 | }); 41 | ``` 42 | 43 | For more advanced customization of outputs, see the [example.js](https://github.com/dataform-co/dataform-scd/blob/master/definitions/example.js). 44 | 45 | ### Scheduling 46 | 47 | Slowly changing dimensions can only by updated as quickly as these models are run. These models should typically be scheduled to run every day or every hour, depending on the granularity of changes you want to capture. 48 | 49 | ### Hash comparison option 50 | 51 | Depending on your data update method, you may want to use the hash field option to compare rows on each execution and only add the ones that have been changed or added. To do this, please make sure your table contains a hash field created using the hash function of your choice. You can find a list of the hash functions available in BigQuery [here](https://cloud.google.com/bigquery/docs/reference/standard-sql/hash_functions). On each incremental run, the query will compare the hashes for each unique identifier to the ones in the updated table. It will only keep the rows where the hash has changed or where the row ID is not found in the current data. 52 | 53 | If you do not want to use the hash comparison, simply omit the hash parameter from the config file or set it to `null`. If you do this, all rows with an updated timestamp will be added to the `{name}_updates` table, even if the data did not otherwise change. 54 | 55 | ## Data models 56 | 57 | This package will create two relations in the warehouse, for a given `name` these will be: 58 | 59 | - `{name}` - a view with `scd_valid_from` and `scd_valid_to` fields 60 | - `{name}_updates` - an incremental table that stores the change history of the source table 61 | -------------------------------------------------------------------------------- /cloudbuild.json: -------------------------------------------------------------------------------- 1 | { 2 | "steps": [ 3 | { 4 | "name": "docker.io/node", 5 | "entrypoint": "npm", 6 | "args": ["install"] 7 | }, 8 | { 9 | "name": "docker.io/dataformco/dataform:2.8.2", 10 | "args": ["compile"] 11 | } 12 | ] 13 | } 14 | -------------------------------------------------------------------------------- /dataform.json: -------------------------------------------------------------------------------- 1 | { 2 | "defaultSchema": "dataform_scd_example", 3 | "warehouse": "bigquery", 4 | "defaultDatabase": "tada-analytics", 5 | "defaultLocation": "us" 6 | } -------------------------------------------------------------------------------- /definitions/example.js: -------------------------------------------------------------------------------- 1 | const scd = require("../index"); 2 | 3 | /** 4 | * Create an SCD table on top of the fake table defined in source_data.sqlx. 5 | */ 6 | const { updates, view } = scd("source_data_scd", { 7 | // A unique identifier for rows in the table. 8 | uniqueKey: "user_id", 9 | // A field that stores a timestamp or date of when the row was last changed. 10 | timestamp: "updated_at", 11 | // A field that stores the hash value of the fields that we want to track changes in. If you do not want to use the hash comparison, you may omit this field or set it to null 12 | hash: "hash_value", 13 | // The source table to build slowly changing dimensions from. 14 | source: { 15 | schema: "dataform_scd_example", 16 | name: "source_data", 17 | }, 18 | // Any tags that will be added to actions. 19 | tags: ["slowly-changing-dimensions"], 20 | // Optional documentation of table columns 21 | columns: {user_id: "User ID", some_field: "Data Field", hash_value: "Hash of all fields to compare",updated_at: "Timestamp for updates"}, 22 | // Any configuration parameters to apply to the incremental table that will be created. 23 | incrementalConfig: { 24 | bigquery: { 25 | partitionBy: "updated_at", 26 | }, 27 | }, 28 | }); 29 | 30 | // Additional customization of the created models can be done by using the returned actions objects. 31 | updates.config({ 32 | // You can specify the output schema here if it is different than the default 33 | schema: "dataform_scd_example", 34 | description: "Updates table for SCD", 35 | }); 36 | -------------------------------------------------------------------------------- /definitions/source_data.sqlx: -------------------------------------------------------------------------------- 1 | config { 2 | type: "view" 3 | } 4 | 5 | WITH example_dataset AS ( 6 | SELECT 7 | user_id, 8 | field_a, 9 | field_b, 10 | updated_at 11 | FROM 12 | ( 13 | SELECT 14 | 1 as user_id, 15 | 'b' AS field_a, 16 | 12.3 AS field_b, 17 | date_add(current_date(), interval 1 day) AS updated_at 18 | ) 19 | UNION ALL 20 | ( 21 | SELECT 22 | 2 as user_id, 23 | 'c' AS field_a, 24 | 23.4 AS field_b, 25 | date_add(current_date(), interval 0 day) AS updated_at 26 | ) 27 | ) 28 | SELECT 29 | *, 30 | md5(concat( 31 | field_a, 32 | cast(field_b AS string) 33 | ) 34 | ) AS hash_value 35 | FROM 36 | example_dataset -------------------------------------------------------------------------------- /index.js: -------------------------------------------------------------------------------- 1 | /** 2 | * Builds a type-2 slowly changing dimensions table and view. 3 | */ 4 | module.exports = ( 5 | name, 6 | { uniqueKey, hash, timestamp, source, tags, incrementalConfig, columns = {} } 7 | ) => { 8 | // Create an incremental table with just pure updates, for a full history of the table. 9 | const updates = publish(`${name}_updates`, { 10 | type: "incremental", 11 | tags, 12 | columns, 13 | ...incrementalConfig, 14 | }).query( 15 | !!hash ? 16 | (ctx) => ` 17 | ${ctx.when( 18 | ctx.incremental(), `with ids_to_update as \ 19 | (select ${uniqueKey}, ${hash} from ${ctx.ref(source)}\ 20 | except distinct \ 21 | (select ${uniqueKey}, ${hash} from ${ctx.self()}))` 22 | )} 23 | select * from ${ctx.ref(source)} 24 | ${ctx.when( 25 | ctx.incremental(), 26 | `where ${timestamp} > (select max(${timestamp}) from ${ctx.self()}) 27 | and ${uniqueKey} in (select ${uniqueKey} from ids_to_update)` 28 | )}` 29 | : 30 | (ctx) => ` 31 | select * from ${ctx.ref(source)} 32 | ${ctx.when( 33 | ctx.incremental(), 34 | `where ${timestamp} > (select max(${timestamp}) from ${ctx.self()})` 35 | )}` 36 | ); 37 | 38 | 39 | // Create a view on top of the raw updates table that contains computed valid_from and valid_to fields. 40 | const view = publish(name, { 41 | type: "view", 42 | tags, 43 | columns: { 44 | ...columns, 45 | scd_valid_from: `The timestamp from which this row is valid for the given ${uniqueKey}.`, 46 | scd_valid_to: `The timestamp until which this row is valid for the given ${uniqueKey}, or null if this it the latest value.`, 47 | }, 48 | }).query( 49 | (ctx) => ` 50 | select 51 | *, 52 | ${timestamp} as scd_valid_from, 53 | lead(${timestamp}) over (partition by ${uniqueKey} order by ${timestamp} asc) as scd_valid_to 54 | from 55 | ${ctx.ref(updates.proto.target.schema, `${name}_updates`)} 56 | ` 57 | ); 58 | 59 | // Returns the tables so they can be customized. 60 | return { view, updates }; 61 | }; -------------------------------------------------------------------------------- /package-lock.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dataform-scd", 3 | "lockfileVersion": 3, 4 | "requires": true, 5 | "packages": { 6 | "": { 7 | "name": "dataform-scd", 8 | "dependencies": { 9 | "@dataform/core": "2.8.2" 10 | } 11 | }, 12 | "node_modules/@dataform/core": { 13 | "version": "2.8.2", 14 | "resolved": "https://registry.npmjs.org/@dataform/core/-/core-2.8.2.tgz", 15 | "integrity": "sha512-VdlBEk4g8YUGgXAxEYvUzQ+bbYyf3/XLcFgwH9TntpE2K5cXdAz2DIPNRbmH0yjmaePHC9w9WDXzpLz+fJucCw==" 16 | } 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /package.json: -------------------------------------------------------------------------------- 1 | { 2 | "name": "dataform-scd", 3 | "dependencies": { 4 | "@dataform/core": "2.8.2" 5 | } 6 | } 7 | --------------------------------------------------------------------------------