├── .github └── CODEOWNERS ├── .gitignore ├── License.md ├── README.md ├── bq-incrementals ├── .gitignore ├── data │ ├── seed_cov_bond.csv │ ├── seed_defs_op.csv │ └── seed_oss.csv ├── dbt_project.yml ├── macros │ └── get_last_3d.sql ├── models │ ├── insert_overwrite_dynamic │ │ ├── iod_aggregated.sql │ │ ├── iod_enriched.sql │ │ └── iod_goldilocks.sql │ ├── insert_overwrite_static │ │ ├── ios_aggregated.sql │ │ ├── ios_enriched.sql │ │ └── ios_goldilocks.sql │ ├── merge │ │ ├── m_aggregated.sql │ │ ├── m_enriched.sql │ │ └── m_goldilocks.sql │ ├── merge_clustered │ │ ├── mc_aggregated.sql │ │ ├── mc_enriched.sql │ │ └── mc_goldilocks.sql │ ├── pages_of_interest.sql │ └── wikipedia_source.yml └── packages.yml ├── business-hours ├── .gitignore ├── README.md ├── analysis │ └── .gitkeep ├── dbt_project.yml ├── macros │ ├── .gitkeep │ ├── attempt-1-macros │ │ └── business_time_functions.sql │ └── attempt-2-subquery │ │ └── business_time_functions.sql ├── models │ ├── all_business_hours.sql │ └── fct_support_tickets.sql ├── packages.yml ├── seeds │ ├── .gitkeep │ └── sample_tickets.csv ├── snapshots │ └── .gitkeep └── tests │ └── .gitkeep ├── dynamic-data-masking-redshift ├── README.md ├── data │ └── employees.csv ├── dbt_project.yml ├── macros │ └── apply_data_masking.sql ├── models │ └── employees_with_masking.sql └── packages.yml ├── insert_by_period ├── .gitignore ├── Makefile ├── README.md ├── dbt_project.yml ├── integration_tests │ ├── .gitignore │ ├── ci │ │ └── sample.profiles.yml │ ├── data │ │ ├── data_insert_by_period.csv │ │ └── data_insert_by_period_overwrite.csv │ ├── dbt_project.yml │ ├── macros │ │ ├── .gitkeep │ │ ├── assert_equal_values.sql │ │ ├── limit_zero.sql │ │ └── tests.sql │ ├── models │ │ ├── expected_insert_by_period.sql │ │ ├── expected_insert_by_period_overwrite.sql │ │ ├── schema.yml │ │ └── test_insert_by_period.sql │ └── packages.yml └── macros │ ├── create_relation_for_insert_by_period.sql │ ├── get_period_boundaries.sql │ ├── get_period_sql.sql │ ├── get_rows_inserted.sql │ └── insert_by_period_materialization.sql ├── lambda-views ├── .gitignore ├── README.md ├── data │ └── .gitkeep ├── dbt_project.yml ├── etc │ ├── option-1-dag.png │ └── option-2-dag.png ├── macros │ ├── .gitkeep │ ├── lambda │ │ ├── lambda_filter.sql │ │ └── lambda_union.sql │ └── models │ │ ├── page_views_model_sql.sql │ │ └── sessions_model_sql.sql └── models │ ├── option_1 │ ├── page_views.sql │ ├── page_views__lambda_current.sql │ ├── page_views__lambda_historical.sql │ ├── sessions.sql │ ├── sessions__lambda_current.sql │ └── sessions__lambda_historical.sql │ ├── option_2 │ ├── page_views.sql │ ├── page_views__lambda_historical.sql │ ├── sessions.sql │ └── sessions__lambda_historical.sql │ ├── sources.yml │ └── thought_experiment │ ├── page_views.sql │ └── sessions.sql ├── materialized-views ├── .gitignore ├── README.md ├── dbt_project.yml ├── integration_tests │ ├── .gitignore │ ├── Makefile │ ├── dbt_project.yml │ ├── macros │ │ └── overrides.sql │ ├── models │ │ ├── base_tbl.sql │ │ ├── schema.yml │ │ ├── test_mv_auto.sql │ │ └── test_mv_manual.sql │ ├── packages.yml │ └── seed │ │ ├── expected.csv │ │ ├── seed.csv │ │ └── seed_update.csv └── macros │ ├── bigquery │ ├── adapters.sql │ └── materialized_view.sql │ ├── default │ ├── adapters.sql │ └── materialized_view.sql │ ├── postgres │ └── adapters.sql │ ├── redshift │ └── adapters.sql │ └── snowflake │ ├── adapters.sql │ └── materialized_view.sql ├── read-external-iceberg ├── .gitignore ├── README.md ├── dbt_project.yml ├── macros 
│ └── plugins │ │ └── snowflake │ │ ├── create_iceberg_source.sql │ │ └── get_external_build_plan.sql └── packages.yml └── snapshot-testing ├── .gitignore ├── README.md ├── data ├── .gitkeep └── fct_orders.csv ├── dbt_project.yml ├── macros ├── .gitkeep ├── historic_revenue_snapshot_cleanup.sql └── test_is_null.sql ├── snapshots ├── .gitkeep ├── historic_revenue_snapshot.sql └── schema.yml └── tests └── .gitkeep /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @dbt-labs/core-team 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | dbt_modules/ 3 | logs/ -------------------------------------------------------------------------------- /License.md: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 
47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. 
You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. 
You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "{}" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright {yyyy} {name of copyright owner} 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # dbt Labs: Experimental Features 2 | 3 | This repository includes projects that extend existing dbt features, experiment with new database features not yet natively supported in dbt, or otherwise demonstrate cool stuff you can do with just Jinja macros in your project—no forks necessary. 4 | 5 | In all cases, these are _demo_ projects, not intended as ready-to-use packages.
If you want to use code from this repository in your own project, you're more than welcome to clone and install as a [local package](https://docs.getdbt.com/docs/building-a-dbt-project/package-management/#local-packages), or just copy-paste :) A minimal local-package example is included near the end of this README. 6 | 7 | ## [BigQuery Incremental Strategies](bq-incrementals) 8 | 9 | * These features shipped in dbt v0.16.0! See [changelog](https://github.com/fishtown-analytics/dbt/blob/dev/octavius-catto/CHANGELOG.md#features-4) and [docs](https://docs.getdbt.com/docs/building-a-dbt-project/building-models/bigquery-configs/#merge-behavior-incremental-models) 10 | * The [project here](bq-incrementals) provided the substrate for a [discourse post](https://discourse.getdbt.com/t/981) benchmarking different incremental strategies on BigQuery 11 | 12 | ## [Materialized views](materialized-views) 13 | 14 | This project adds support for `materialized_view` as a new dbt materialization. It includes implementations for Postgres, Redshift, Snowflake, and BigQuery, through a mix of new macros and overrides of built-in dbt macros. See the [project README](materialized-views/README.md) for details. For another take on dbt + materialized views, check out the [dbt-materialize](https://github.com/MaterializeInc/materialize/tree/main/misc/dbt-materialize#dbt-materialize) plugin. 15 | 16 | ## [Lambda views](lambda-views) 17 | This lab demonstrates a number of options for lambda views, as discussed in this [discourse article](https://discourse.getdbt.com/t/how-to-create-near-real-time-models-with-just-dbt-sql/1457/3). Additional details about the various approaches can be found at [lambda-views/README.md](lambda-views/README.md). 18 | 19 | ## [Snapshot testing](snapshot-testing) 20 | This lab demonstrates how to use snapshots to detect dbt model regressions, as discussed in this [discourse article](https://discourse.getdbt.com/t/build-snapshot-based-tests-to-detect-regressions-in-historic-data/1478). Additional details on how to test this code for yourself can be found at [snapshot-testing/README.md](snapshot-testing/README.md). 21 | 22 | 23 | ## [Dynamic data masking on Redshift](dynamic-data-masking-redshift) 24 | This lab demonstrates how to implement dynamic data masking on Redshift. 25 | 26 | Check out [this discourse article](https://discourse.getdbt.com/t/how-to-implement-dynamic-data-masking-on-redshift/2043) for more information. 27 | 28 | ## [Time on Task](business-hours) 29 | 30 | This lab demonstrates two strategies for measuring Time on Task. 31 | 32 | Check out [this devhub article](https://docs.getdbt.com/blog/measuring-business-hours-sql-time-on-task) for more information.
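## Installing a project as a local package

If you do want to pull one of these demo projects into your own dbt project, a minimal sketch of a `packages.yml` entry is below. The relative path is just an example; point it at wherever your clone of this repository lives, then run `dbt deps`.

```yml
packages:
  # example path: a clone of this repo checked out next to your dbt project
  - local: ../dbt-labs-experimental-features/lambda-views
```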
33 | 34 | ## Resources: 35 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 36 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 37 | - Join the [chat](http://community.getdbt.com/) on Slack for live discussions and support 38 | - Find [dbt events](https://events.getdbt.com) near you 39 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 40 | -------------------------------------------------------------------------------- /bq-incrementals/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_modules/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /bq-incrementals/data/seed_cov_bond.csv: -------------------------------------------------------------------------------- 1 | lang,title 2 | af,"Kovalente_binding" 3 | ar,"رابطة_تساهمية" 4 | ast,"Enllaz_covalente" 5 | az,"Kovalent_əlaqə" 6 | be,"Кавалентная_сувязь" 7 | bg,"Ковалентна_връзка" 8 | bn,"সমযোজী_বন্ধন" 9 | bs,"Kovalentna_veza" 10 | ca,"Enllaç_covalent" 11 | ckb,"بەندی_کوالانسی" 12 | cs,"Kovalentní_vazba" 13 | cy,"Bond_cofalent" 14 | da,"Kovalent_binding" 15 | de,"Kovalente_Bindung" 16 | el,"Ομοιοπολικός_δεσμός" 17 | en,"Covalent_bond" 18 | eo,"Kovalenta_ligo" 19 | es,"Enlace_covalente" 20 | et,"Kovalentne_side" 21 | eu,"Lotura_kobalente" 22 | fa,"پیوند_کووالانسی" 23 | fi,"Kovalenttinen_sidos" 24 | fr,"Liaison_covalente" 25 | gl,"Enlace_covalente" 26 | gv,"Kiangley_cofioosagh" 27 | he,"קשר_קוולנטי" 28 | hi,"सहसंयोजी_आबंध" 29 | hr,"Kovalentna_veza" 30 | ht,"Lyezon_kovalan" 31 | hu,"Kovalens_kötés" 32 | id,"Ikatan_kovalen" 33 | it,"Legame_covalente" 34 | ja,"共有結合" 35 | jv,"Ikatan_Kovalen" 36 | kab,"Turza_tattekkant" 37 | ka,"კოვალენტური_ბმა" 38 | kk,"Коваленттік_байланыс" 39 | km,"សម្ព័ន្ធកូវ៉ាឡង់" 40 | ko,"공유_결합" 41 | lt,"Kovalentinis_ryšys" 42 | lv,"Kovalentā_saite" 43 | mk,"Ковалентна_врска" 44 | ml,"സഹസംയോജകബന്ധനം" 45 | mr,"सहसंयुज_बंध" 46 | ms,"Ikatan_kovalen" 47 | nl,"Covalente_binding" 48 | nn,"Kovalent_binding" 49 | no,"Kovalent_binding" 50 | oc,"Ligam_covalent" 51 | pa,"ਸਹਿਯੋਜਕੀ_ਜੋੜ" 52 | pl,"Wiązanie_kowalencyjne" 53 | pnb,"کوویلنٹ_جوڑ" 54 | pt,"Ligação_covalente" 55 | ro,"Legătură_covalentă" 56 | rue,"Ковалентна_вязба" 57 | ru,"Ковалентная_связь" 58 | sco,"Covalent_bond" 59 | sh,"Kovalentna_veza" 60 | simple,"Covalent_bond" 61 | si,"සහසංයුජ_බන්ධනය" 62 | sk,"Kovalentná_väzba" 63 | sl,"Kovalentna_vez" 64 | sq,"Lidhja_kovalente" 65 | sr,"Ковалентна_веза" 66 | su,"Beungkeut_kovalén" 67 | sv,"Kovalent_bindning" 68 | ta,"சகப்_பிணைப்பு" 69 | th,"พันธะโคเวเลนต์" 70 | tr,"Kovalent_bağ" 71 | tt,"Ковалент_бәйләнеш" 72 | uk,"Ковалентний_зв'язок" 73 | ur,"کوویلنٹ_بونڈ" 74 | vi,"Liên_kết_cộng_hóa_trị" 75 | wuu,"共价键" 76 | yi,"קאוואלענטער_בונד" 77 | yo,"Ìsopọ̀_àjọfagbáradìmú" 78 | zh_classical,"共價鍵" 79 | zh_min_nan,"Kiōng-iú_kiat-ha̍p" 80 | zh_yue,"共價鍵" 81 | zh,"共价键" -------------------------------------------------------------------------------- /bq-incrementals/data/seed_defs_op.csv: -------------------------------------------------------------------------------- 1 | lang,title 2 | be,"Пражскія_дэфенестрацыі" 3 | bg,"Пражка_дефенестрация" 4 | br,"Difrenestriñ_Praha" 5 | ca,"Defenestració_de_Praga" 6 | en,"Defenestrations_of_Prague" 7 | es,"Defenestraciones_de_Praga" 8 | fr,"Défenestration_de_Prague" 9 | gl,"Defenestracións_de_Praga" 10 | hr,"Praška_defenestracija" 11 | id,"Pelemparan_di_Praha" 12 
| ja,"プラハ窓外放出事件" 13 | ka,"პრაღის_დეფენესტრაციები" 14 | ko,"프라하_창밖_투척사건" 15 | la,"Defenestratio_Pragensis" 16 | ms,"Defenestratio_Pragensis" 17 | no,"Defenestrasjonene_i_Praha" 18 | pl,"Defenestracja_praska" 19 | pt,"Defenestrações_de_Praga" 20 | ro,"Defenestrațiile_de_la_Praga" 21 | ru,"Пражские_дефенестрации" 22 | scn,"Difinistrazzioni_di_Praga" 23 | sh,"Praška_defenestracija" 24 | sl,"Praška_defenestracija" 25 | sr,"Прашка_дефенестрација" 26 | sv,"Defenestrationerna_i_Prag" 27 | uk,"Празька_дефенестрація" 28 | vec,"Desfenestrazion_de_Praga" 29 | zh,"布拉格拋窗事件" 30 | -------------------------------------------------------------------------------- /bq-incrementals/data/seed_oss.csv: -------------------------------------------------------------------------------- 1 | lang,title 2 | af,"Oopbronsagteware" 3 | ar,"برمجيات_مفتوحة_المصدر" 4 | ast,"Software_de_códigu_abiertu" 5 | bg,"Софтуер_с_отворен_код" 6 | bn,"উন্মুক্ত-উৎসের_সফটওয়্যার" 7 | bs,"Softver_otvorenog_koda" 8 | ckb,"نەرمامێری_سەرچاوە_کراوە" 9 | cs,"Otevřený_software" 10 | el,"Λογισμικό_ανοικτού_κώδικα" 11 | en,"Open-source_software" 12 | eo,"Malfermkoda_programaro" 13 | es,"Software_de_código_abierto" 14 | et,"Avatud_lähtekoodiga_tarkvara" 15 | fa,"نرم‌افزار_متن‌باز" 16 | fy,"Open-source_software" 17 | gl,"Software_de_código_aberto" 18 | he,"קוד_פתוח" 19 | hi,"मुक्त_स्रोत_सॉफ्टवेयर" 20 | hu,"Nyílt_forráskódú_szoftver" 21 | id,"Perangkat_lunak_sumber_terbuka" 22 | is,"Opinn_hugbúnaður" 23 | ja,"オープンソースソフトウェア" 24 | ko,"오픈_소스_소프트웨어" 25 | ky,"Ачык_булактуу_програмдык_камсыздоо" 26 | la,"Programma_fontium_apertorum" 27 | lt,"Atvirojo_kodo_programa" 28 | ml,"ഓപ്പൺ_സോഴ്സ്_സോഫ്റ്റ്‌വെയർ" 29 | mr,"मुक्त_स्रोत" 30 | ms,"Perisian_sumber_terbuka" 31 | nl,"Opensourcesoftware" 32 | or,"ଓପନ-ସୋର୍ସ_ସଫ୍ଟୱେର" 33 | pa,"ਖੁੱਲ੍ਹਾ-ਸਰੋਤ_ਸਾਫ਼ਟਵੇਅਰ" 34 | pl,"Otwarte_oprogramowanie" 35 | pt,"Software_de_código_aberto" 36 | ro,"Software_cu_sursă_deschisă" 37 | ru,"Открытое_программное_обеспечениеs" 38 | cn,"Open_source" 39 | sh,"Otvoreni_softver" 40 | si,"විවෘත_කේත_මෘදුකාංග" 41 | sk,"Open-source_softvér" 42 | sl,"Odprtokodna_programska_oprema" 43 | sq,"Softuerët_me_burim_të_hapur" 44 | sr,"Softver_otvorenog_koda" 45 | ta,"திறந்த_மூல_மென்பொருள்te ఓపెన్_సోర్సు_సాఫ్ట్​వేర్" 46 | th,"ซอฟต์แวร์โอเพนซอร์ซ" 47 | uk,"Відкрите_програмне_забезпечення" 48 | uz,"Ochiq_manbali_dastur" 49 | vi,"Phần_mềm_nguồn_mở" 50 | wuu,"开源软件" 51 | zh_min_nan,"Khai-goân_nńg-thé" 52 | zh,"开源软件" 53 | -------------------------------------------------------------------------------- /bq-incrementals/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | name: 'bq_incremental_testing' 3 | version: '0.1.0' 4 | config-version: 2 5 | 6 | profile: 'garage-bigquery' 7 | 8 | source-paths: ["models"] 9 | analysis-paths: ["analysis"] 10 | test-paths: ["tests"] 11 | data-paths: ["data"] 12 | macro-paths: ["macros"] 13 | 14 | target-path: "target" 15 | clean-targets: 16 | - "target" 17 | - "dbt_modules" 18 | 19 | require-dbt-version: ">=0.16.0" 20 | 21 | models: 22 | vars: 23 | old: 3 24 | new: 1 25 | -------------------------------------------------------------------------------- /bq-incrementals/macros/get_last_3d.sql: -------------------------------------------------------------------------------- 1 | {% macro get_last_3d() %} 2 | 3 | {% set partitions = [] %} 4 | 5 | {% set max_d_ago = var('new') + 1 %} 6 | 7 | {% for i in range(1, max_d_ago) %} 8 | {% set this_partition %} date_sub(current_date, interval -{{i}} day) {% endset %} 9 | {% do 
partitions.append(this_partition) %} 10 | {% endfor %} 11 | 12 | {% do return(partitions) %} 13 | 14 | {% endmacro %} 15 | -------------------------------------------------------------------------------- /bq-incrementals/models/insert_overwrite_dynamic/iod_aggregated.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id', 4 | partition_by = {'field': 'date_hour', 'data_type': 'timestamp'}, 5 | incremental_strategy = 'insert_overwrite' 6 | )}} 7 | 8 | with page_views as ( 9 | 10 | select * from {{source('wikipedia', 'pageviews_2020')}} 11 | 12 | {% if is_incremental() %} 13 | -- always rebuild up to the current day 14 | where date(datehour) >= date_sub(date(_dbt_max_partition), interval ({{var('new')}}) day) 15 | and date(datehour) < current_date 16 | {% else %} 17 | -- this source table requires a partition filter regardless 18 | where date(datehour) >= date_sub(current_date, interval ({{var('old')}}) day) 19 | and date(datehour) < current_date 20 | {% endif %} 21 | 22 | ), 23 | 24 | pages_of_interest as ( 25 | 26 | select * from {{ref('pages_of_interest')}} 27 | 28 | ), 29 | 30 | parsed as ( 31 | 32 | select *, 33 | 34 | replace(split(wiki, '.')[offset(0)], '-', '_') as lang 35 | 36 | from page_views 37 | 38 | ), 39 | 40 | tagged as ( 41 | 42 | select * from parsed 43 | left join pages_of_interest using (title, lang) 44 | 45 | ), 46 | 47 | agg as ( 48 | 49 | select 50 | 51 | datehour as date_hour, 52 | subject, 53 | lang, 54 | sum(views) as total_views 55 | 56 | from tagged 57 | group by 1,2,3 58 | 59 | ), 60 | 61 | final as ( 62 | 63 | select 64 | 65 | {{ dbt_utils.surrogate_key('date_hour', 'subject', 'lang') }} as id, 66 | * 67 | 68 | from agg 69 | 70 | ) 71 | 72 | select * from final 73 | -------------------------------------------------------------------------------- /bq-incrementals/models/insert_overwrite_dynamic/iod_enriched.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id', 4 | partition_by = {'field': 'date_hour', 'data_type': 'timestamp'}, 5 | incremental_strategy = 'insert_overwrite' 6 | )}} 7 | 8 | with page_views as ( 9 | 10 | select * from {{source('wikipedia', 'pageviews_2020')}} 11 | 12 | {% if is_incremental() %} 13 | -- always rebuild up to the current day 14 | where date(datehour) >= date_sub(date(_dbt_max_partition), interval ({{var('new')}}) day) 15 | and date(datehour) < current_date 16 | {% else %} 17 | -- this source table requires a partition filter regardless 18 | where date(datehour) >= date_sub(current_date, interval ({{var('old')}}) day) 19 | and date(datehour) < current_date 20 | {% endif %} 21 | 22 | ), 23 | 24 | pages_of_interest as ( 25 | 26 | select * from {{ref('pages_of_interest')}} 27 | 28 | ), 29 | 30 | parsed as ( 31 | 32 | select *, 33 | 34 | replace(split(wiki, '.')[offset(0)], '-', '_') as lang 35 | 36 | from page_views 37 | 38 | ), 39 | 40 | tagged as ( 41 | 42 | select * from parsed 43 | left join pages_of_interest using (title, lang) 44 | 45 | ), 46 | 47 | agg as ( 48 | 49 | select 50 | 51 | datehour as date_hour, 52 | lang, 53 | title, 54 | subject, 55 | sum(views) as views 56 | 57 | from tagged 58 | group by 1,2,3,4 59 | 60 | ), 61 | 62 | final as ( 63 | 64 | select 65 | 66 | {{ dbt_utils.surrogate_key('date_hour', 'lang', 'title') }} as id, 67 | * 68 | 69 | from agg 70 | 71 | ) 72 | 73 | select * from final 74 | 
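-- Note on the incremental filter above: with the dynamic insert_overwrite strategy,
-- dbt's BigQuery adapter declares a scripting variable named _dbt_max_partition before
-- running the merge. A rough sketch of that declaration (the table reference below is a
-- placeholder, not actual generated code):
--
--   declare _dbt_max_partition timestamp default (
--     select max(date_hour) from <existing target table> where date_hour is not null
--   );
--
-- So the filter means: rebuild every partition from var('new') days before the newest
-- partition already in the target table, up to (but not including) the current date.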
-------------------------------------------------------------------------------- /bq-incrementals/models/insert_overwrite_dynamic/iod_goldilocks.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id', 4 | partition_by = {'field': 'date_day', 'data_type': 'date'}, 5 | incremental_strategy = 'insert_overwrite' 6 | )}} 7 | 8 | with page_views as ( 9 | 10 | select * from {{source('wikipedia', 'pageviews_2020')}} 11 | 12 | {% if is_incremental() %} 13 | -- always rebuild up to the current day 14 | where date(datehour) >= date_sub(_dbt_max_partition, interval ({{var('new')}}) day) 15 | and date(datehour) < current_date 16 | {% else %} 17 | -- this source table requires a partition filter regardless 18 | where date(datehour) >= date_sub(current_date, interval ({{var('old')}}) day) 19 | and date(datehour) < current_date 20 | {% endif %} 21 | 22 | ), 23 | 24 | pages_of_interest as ( 25 | 26 | select * from {{ref('pages_of_interest')}} 27 | 28 | ), 29 | 30 | parsed as ( 31 | 32 | select *, 33 | 34 | date(datehour) as date_day, 35 | replace(split(wiki, '.')[offset(0)], '-', '_') as lang 36 | 37 | from page_views 38 | 39 | ), 40 | 41 | tagged as ( 42 | 43 | select * from parsed 44 | left join pages_of_interest using (title, lang) 45 | 46 | ), 47 | 48 | agg as ( 49 | 50 | select 51 | 52 | date_day, 53 | lang, 54 | title, 55 | subject, 56 | sum(views) as views 57 | 58 | from tagged 59 | group by 1,2,3,4 60 | 61 | ), 62 | 63 | final as ( 64 | 65 | select 66 | 67 | {{ dbt_utils.surrogate_key('date_day', 'lang', 'title') }} as id, 68 | * 69 | 70 | from agg 71 | 72 | ) 73 | 74 | select * from final 75 | -------------------------------------------------------------------------------- /bq-incrementals/models/insert_overwrite_static/ios_aggregated.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id', 4 | partition_by = {'field': 'date_hour', 'data_type': 'timestamp'}, 5 | incremental_strategy = 'insert_overwrite', 6 | partitions = get_last_3d() 7 | )}} 8 | 9 | with page_views as ( 10 | 11 | select * from {{source('wikipedia', 'pageviews_2020')}} 12 | 13 | {% if is_incremental() %} 14 | -- always rebuild up to the current day 15 | where date(datehour) >= date_sub(current_date, interval ({{var('new')}}) day) 16 | and date(datehour) < current_date 17 | {% else %} 18 | -- this source table requires a partition filter regardless 19 | where date(datehour) >= date_sub(current_date, interval ({{var('old')}}) day) 20 | and date(datehour) < current_date 21 | {% endif %} 22 | 23 | ), 24 | 25 | pages_of_interest as ( 26 | 27 | select * from {{ref('pages_of_interest')}} 28 | 29 | ), 30 | 31 | parsed as ( 32 | 33 | select *, 34 | 35 | replace(split(wiki, '.')[offset(0)], '-', '_') as lang 36 | 37 | from page_views 38 | 39 | ), 40 | 41 | tagged as ( 42 | 43 | select * from parsed 44 | left join pages_of_interest using (title, lang) 45 | 46 | ), 47 | 48 | agg as ( 49 | 50 | select 51 | 52 | datehour as date_hour, 53 | subject, 54 | lang, 55 | sum(views) as total_views 56 | 57 | from tagged 58 | group by 1,2,3 59 | 60 | ), 61 | 62 | final as ( 63 | 64 | select 65 | 66 | {{ dbt_utils.surrogate_key('date_hour', 'subject', 'lang') }} as id, 67 | * 68 | 69 | from agg 70 | 71 | ) 72 | 73 | select * from final 74 | -------------------------------------------------------------------------------- 
/bq-incrementals/models/insert_overwrite_static/ios_enriched.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id', 4 | partition_by = {'field': 'date_hour', 'data_type': 'timestamp'}, 5 | incremental_strategy = 'insert_overwrite', 6 | partitions = get_last_3d() 7 | )}} 8 | 9 | with page_views as ( 10 | 11 | select * from {{source('wikipedia', 'pageviews_2020')}} 12 | 13 | {% if is_incremental() %} 14 | -- always rebuild up to the current day 15 | where date(datehour) >= date_sub(current_date, interval ({{var('new')}}) day) 16 | and date(datehour) < current_date 17 | {% else %} 18 | -- this source table requires a partition filter regardless 19 | where date(datehour) >= date_sub(current_date, interval ({{var('old')}}) day) 20 | and date(datehour) < current_date 21 | {% endif %} 22 | 23 | ), 24 | 25 | pages_of_interest as ( 26 | 27 | select * from {{ref('pages_of_interest')}} 28 | 29 | ), 30 | 31 | parsed as ( 32 | 33 | select *, 34 | 35 | replace(split(wiki, '.')[offset(0)], '-', '_') as lang 36 | 37 | from page_views 38 | 39 | ), 40 | 41 | tagged as ( 42 | 43 | select * from parsed 44 | left join pages_of_interest using (title, lang) 45 | 46 | ), 47 | 48 | agg as ( 49 | 50 | select 51 | 52 | datehour as date_hour, 53 | lang, 54 | title, 55 | subject, 56 | sum(views) as views 57 | 58 | from tagged 59 | group by 1,2,3,4 60 | 61 | ), 62 | 63 | final as ( 64 | 65 | select 66 | 67 | {{ dbt_utils.surrogate_key('date_hour', 'lang', 'title') }} as id, 68 | * 69 | 70 | from agg 71 | 72 | ) 73 | 74 | select * from final 75 | -------------------------------------------------------------------------------- /bq-incrementals/models/insert_overwrite_static/ios_goldilocks.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id', 4 | partition_by = {'field': 'date_day', 'data_type': 'date'}, 5 | incremental_strategy = 'insert_overwrite', 6 | partitions = get_last_3d() 7 | )}} 8 | 9 | with page_views as ( 10 | 11 | select * from {{source('wikipedia', 'pageviews_2020')}} 12 | 13 | {% if is_incremental() %} 14 | -- always rebuild up to the current day 15 | where date(datehour) >= date_sub(current_date, interval ({{var('new')}}) day) 16 | and date(datehour) < current_date 17 | {% else %} 18 | -- this source table requires a partition filter regardless 19 | where date(datehour) >= date_sub(current_date, interval ({{var('old')}}) day) 20 | and date(datehour) < current_date 21 | {% endif %} 22 | 23 | ), 24 | 25 | pages_of_interest as ( 26 | 27 | select * from {{ref('pages_of_interest')}} 28 | 29 | ), 30 | 31 | parsed as ( 32 | 33 | select *, 34 | 35 | date(datehour) as date_day, 36 | replace(split(wiki, '.')[offset(0)], '-', '_') as lang 37 | 38 | from page_views 39 | 40 | ), 41 | 42 | tagged as ( 43 | 44 | select * from parsed 45 | left join pages_of_interest using (title, lang) 46 | 47 | ), 48 | 49 | agg as ( 50 | 51 | select 52 | 53 | date_day, 54 | lang, 55 | title, 56 | subject, 57 | sum(views) as views 58 | 59 | from tagged 60 | group by 1,2,3,4 61 | 62 | ), 63 | 64 | final as ( 65 | 66 | select 67 | 68 | {{ dbt_utils.surrogate_key('date_day', 'lang', 'title') }} as id, 69 | * 70 | 71 | from agg 72 | 73 | ) 74 | 75 | select * from final 76 | -------------------------------------------------------------------------------- /bq-incrementals/models/merge/m_aggregated.sql: 
-------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id', 4 | partition_by = {'field': 'date_hour', 'data_type': 'timestamp'} 5 | )}} 6 | 7 | with page_views as ( 8 | 9 | select * from {{source('wikipedia', 'pageviews_2020')}} 10 | 11 | {% if is_incremental() %} 12 | -- always rebuild up to the current day 13 | where date(datehour) >= date_sub(current_date, interval ({{var('new')}}) day) 14 | and date(datehour) < current_date 15 | {% else %} 16 | -- this source table requires a partition filter regardless 17 | where date(datehour) >= date_sub(current_date, interval ({{var('old')}}) day) 18 | and date(datehour) < current_date 19 | {% endif %} 20 | 21 | ), 22 | 23 | pages_of_interest as ( 24 | 25 | select * from {{ref('pages_of_interest')}} 26 | 27 | ), 28 | 29 | parsed as ( 30 | 31 | select *, 32 | 33 | replace(split(wiki, '.')[offset(0)], '-', '_') as lang 34 | 35 | from page_views 36 | 37 | ), 38 | 39 | tagged as ( 40 | 41 | select * from parsed 42 | left join pages_of_interest using (title, lang) 43 | 44 | ), 45 | 46 | agg as ( 47 | 48 | select 49 | 50 | datehour as date_hour, 51 | subject, 52 | lang, 53 | sum(views) as total_views 54 | 55 | from tagged 56 | group by 1,2,3 57 | 58 | ), 59 | 60 | final as ( 61 | 62 | select 63 | 64 | {{ dbt_utils.surrogate_key('date_hour', 'subject', 'lang') }} as id, 65 | * 66 | 67 | from agg 68 | 69 | ) 70 | 71 | select * from final 72 | -------------------------------------------------------------------------------- /bq-incrementals/models/merge/m_enriched.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id', 4 | partition_by = {'field': 'date_hour', 'data_type': 'timestamp'} 5 | )}} 6 | 7 | with page_views as ( 8 | 9 | select * from {{source('wikipedia', 'pageviews_2020')}} 10 | 11 | {% if is_incremental() %} 12 | -- always rebuild up to the current day 13 | where date(datehour) >= date_sub(current_date, interval ({{var('new')}}) day) 14 | and date(datehour) < current_date 15 | {% else %} 16 | -- this source table requires a partition filter regardless 17 | where date(datehour) >= date_sub(current_date, interval ({{var('old')}}) day) 18 | and date(datehour) < current_date 19 | {% endif %} 20 | 21 | ), 22 | 23 | pages_of_interest as ( 24 | 25 | select * from {{ref('pages_of_interest')}} 26 | 27 | ), 28 | 29 | parsed as ( 30 | 31 | select *, 32 | 33 | replace(split(wiki, '.')[offset(0)], '-', '_') as lang 34 | 35 | from page_views 36 | 37 | ), 38 | 39 | tagged as ( 40 | 41 | select * from parsed 42 | left join pages_of_interest using (title, lang) 43 | 44 | ), 45 | 46 | agg as ( 47 | 48 | select 49 | 50 | datehour as date_hour, 51 | lang, 52 | title, 53 | subject, 54 | sum(views) as views 55 | 56 | from tagged 57 | group by 1,2,3,4 58 | 59 | ), 60 | 61 | final as ( 62 | 63 | select 64 | 65 | {{ dbt_utils.surrogate_key('date_hour', 'lang', 'title') }} as id, 66 | * 67 | 68 | from agg 69 | 70 | ) 71 | 72 | select * from final 73 | -------------------------------------------------------------------------------- /bq-incrementals/models/merge/m_goldilocks.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id', 4 | partition_by = {'field': 'date_day', 'data_type': 'date'} 5 | )}} 6 | 7 | with page_views as ( 8 | 9 | select * from {{source('wikipedia', 
'pageviews_2020')}} 10 | 11 | {% if is_incremental() %} 12 | -- always rebuild up to the current day 13 | where date(datehour) >= date_sub(current_date, interval ({{var('new')}}) day) 14 | and date(datehour) < current_date 15 | {% else %} 16 | -- this source table requires a partition filter regardless 17 | where date(datehour) >= date_sub(current_date, interval ({{var('old')}}) day) 18 | and date(datehour) < current_date 19 | {% endif %} 20 | 21 | ), 22 | 23 | pages_of_interest as ( 24 | 25 | select * from {{ref('pages_of_interest')}} 26 | 27 | ), 28 | 29 | parsed as ( 30 | 31 | select *, 32 | 33 | date(datehour) as date_day, 34 | replace(split(wiki, '.')[offset(0)], '-', '_') as lang 35 | 36 | from page_views 37 | 38 | ), 39 | 40 | tagged as ( 41 | 42 | select * from parsed 43 | left join pages_of_interest using (title, lang) 44 | 45 | ), 46 | 47 | agg as ( 48 | 49 | select 50 | 51 | date_day, 52 | lang, 53 | title, 54 | subject, 55 | sum(views) as views 56 | 57 | from tagged 58 | group by 1,2,3,4 59 | 60 | ), 61 | 62 | final as ( 63 | 64 | select 65 | 66 | {{ dbt_utils.surrogate_key('date_day', 'lang', 'title') }} as id, 67 | * 68 | 69 | from agg 70 | 71 | ) 72 | 73 | select * from final 74 | -------------------------------------------------------------------------------- /bq-incrementals/models/merge_clustered/mc_aggregated.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id', 4 | partition_by = {'field': 'date_hour', 'data_type': 'timestamp'}, 5 | cluster_by = ['id'] 6 | )}} 7 | 8 | with page_views as ( 9 | 10 | select * from {{source('wikipedia', 'pageviews_2020')}} 11 | 12 | {% if is_incremental() %} 13 | -- always rebuild up to the current day 14 | where date(datehour) >= date_sub(current_date, interval ({{var('new')}}) day) 15 | and date(datehour) < current_date 16 | {% else %} 17 | -- this source table requires a partition filter regardless 18 | where date(datehour) >= date_sub(current_date, interval ({{var('old')}}) day) 19 | and date(datehour) < current_date 20 | {% endif %} 21 | 22 | ), 23 | 24 | pages_of_interest as ( 25 | 26 | select * from {{ref('pages_of_interest')}} 27 | 28 | ), 29 | 30 | parsed as ( 31 | 32 | select *, 33 | 34 | replace(split(wiki, '.')[offset(0)], '-', '_') as lang 35 | 36 | from page_views 37 | 38 | ), 39 | 40 | tagged as ( 41 | 42 | select * from parsed 43 | left join pages_of_interest using (title, lang) 44 | 45 | ), 46 | 47 | agg as ( 48 | 49 | select 50 | 51 | datehour as date_hour, 52 | subject, 53 | lang, 54 | sum(views) as total_views 55 | 56 | from tagged 57 | group by 1,2,3 58 | 59 | ), 60 | 61 | final as ( 62 | 63 | select 64 | 65 | {{ dbt_utils.surrogate_key('date_hour', 'subject', 'lang') }} as id, 66 | * 67 | 68 | from agg 69 | 70 | ) 71 | 72 | select * from final 73 | -------------------------------------------------------------------------------- /bq-incrementals/models/merge_clustered/mc_enriched.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id', 4 | partition_by = {'field': 'date_hour', 'data_type': 'timestamp'}, 5 | cluster_by = ['id'] 6 | )}} 7 | 8 | with page_views as ( 9 | 10 | select * from {{source('wikipedia', 'pageviews_2020')}} 11 | 12 | {% if is_incremental() %} 13 | -- always rebuild up to the current day 14 | where date(datehour) >= date_sub(current_date, interval ({{var('new')}}) day) 15 | and date(datehour) < 
current_date 16 | {% else %} 17 | -- this source table requires a partition filter regardless 18 | where date(datehour) >= date_sub(current_date, interval ({{var('old')}}) day) 19 | and date(datehour) < current_date 20 | {% endif %} 21 | 22 | ), 23 | 24 | pages_of_interest as ( 25 | 26 | select * from {{ref('pages_of_interest')}} 27 | 28 | ), 29 | 30 | parsed as ( 31 | 32 | select *, 33 | 34 | replace(split(wiki, '.')[offset(0)], '-', '_') as lang 35 | 36 | from page_views 37 | 38 | ), 39 | 40 | tagged as ( 41 | 42 | select * from parsed 43 | left join pages_of_interest using (title, lang) 44 | 45 | ), 46 | 47 | agg as ( 48 | 49 | select 50 | 51 | datehour as date_hour, 52 | lang, 53 | title, 54 | subject, 55 | sum(views) as views 56 | 57 | from tagged 58 | group by 1,2,3,4 59 | 60 | ), 61 | 62 | final as ( 63 | 64 | select 65 | 66 | {{ dbt_utils.surrogate_key('date_hour', 'lang', 'title') }} as id, 67 | * 68 | 69 | from agg 70 | 71 | ) 72 | 73 | select * from final 74 | -------------------------------------------------------------------------------- /bq-incrementals/models/merge_clustered/mc_goldilocks.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id', 4 | partition_by = {'field': 'date_day', 'data_type': 'date'}, 5 | cluster_by = ['id'] 6 | )}} 7 | 8 | with page_views as ( 9 | 10 | select * from {{source('wikipedia', 'pageviews_2020')}} 11 | 12 | {% if is_incremental() %} 13 | -- always rebuild up to the current day 14 | where date(datehour) >= date_sub(current_date, interval ({{var('new')}}) day) 15 | and date(datehour) < current_date 16 | {% else %} 17 | -- this source table requires a partition filter regardless 18 | where date(datehour) >= date_sub(current_date, interval ({{var('old')}}) day) 19 | and date(datehour) < current_date 20 | {% endif %} 21 | 22 | ), 23 | 24 | pages_of_interest as ( 25 | 26 | select * from {{ref('pages_of_interest')}} 27 | 28 | ), 29 | 30 | parsed as ( 31 | 32 | select *, 33 | 34 | date(datehour) as date_day, 35 | replace(split(wiki, '.')[offset(0)], '-', '_') as lang 36 | 37 | from page_views 38 | 39 | ), 40 | 41 | tagged as ( 42 | 43 | select * from parsed 44 | left join pages_of_interest using (title, lang) 45 | 46 | ), 47 | 48 | agg as ( 49 | 50 | select 51 | 52 | date_day, 53 | lang, 54 | title, 55 | subject, 56 | sum(views) as views 57 | 58 | from tagged 59 | group by 1,2,3,4 60 | 61 | ), 62 | 63 | final as ( 64 | 65 | select 66 | 67 | {{ dbt_utils.surrogate_key('date_day', 'lang', 'title') }} as id, 68 | * 69 | 70 | from agg 71 | 72 | ) 73 | 74 | select * from final 75 | -------------------------------------------------------------------------------- /bq-incrementals/models/pages_of_interest.sql: -------------------------------------------------------------------------------- 1 | with unioned as ( 2 | 3 | {{ dbt_utils.union_relations([ 4 | ref('seed_oss'), 5 | ref('seed_defs_op'), 6 | ref('seed_cov_bond') 7 | ]) }} 8 | 9 | ), 10 | 11 | by_subject as ( 12 | 13 | select *, 14 | 15 | max(case when lang = 'en' then 16 | lower(replace(title, '-', '_')) 17 | end) over (partition by _dbt_source_relation) as subject 18 | 19 | from unioned 20 | 21 | ) 22 | 23 | select * from by_subject 24 | -------------------------------------------------------------------------------- /bq-incrementals/models/wikipedia_source.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | sources: 4 | - name: wikipedia 5 | 
database: "bigquery-public-data" 6 | tables: 7 | - name: pageviews_2020 8 | -------------------------------------------------------------------------------- /bq-incrementals/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: fishtown-analytics/dbt_utils 3 | version: 0.2.5 4 | -------------------------------------------------------------------------------- /business-hours/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_modules/ 4 | dbt_packages/ 5 | logs/ 6 | -------------------------------------------------------------------------------- /business-hours/README.md: -------------------------------------------------------------------------------- 1 | Welcome to your new dbt project! 2 | 3 | ### Using the starter project 4 | 5 | Try running the following commands: 6 | - dbt run 7 | - dbt test 8 | 9 | 10 | ### Resources: 11 | - Learn more about dbt [in the docs](https://docs.getdbt.com/docs/introduction) 12 | - Check out [Discourse](https://discourse.getdbt.com/) for commonly asked questions and answers 13 | - Join the [chat](http://slack.getdbt.com/) on Slack for live discussions and support 14 | - Find [dbt events](https://events.getdbt.com) near you 15 | - Check out [the blog](https://blog.getdbt.com/) for the latest news on dbt's development and best practices 16 | -------------------------------------------------------------------------------- /business-hours/analysis/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/business-hours/analysis/.gitkeep -------------------------------------------------------------------------------- /business-hours/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | name: 'business_hours' 3 | version: '1.0.0' 4 | config-version: 2 5 | profile: 'dbt-learn' 6 | 7 | require-dbt-version: [">=1.0.0"] 8 | 9 | model-paths: ["models"] 10 | analysis-paths: ["analysis"] 11 | test-paths: ["tests"] 12 | seed-paths: ["seeds"] 13 | macro-paths: ["macros"] 14 | snapshot-paths: ["snapshots"] 15 | 16 | target-path: "target" 17 | clean-targets: 18 | - "target" 19 | - "dbt_modules" 20 | 21 | # these variables will need to be in H24 format! 
22 | vars: 23 | working_hour_start: 8 24 | working_hour_end: 20 25 | 26 | 27 | models: 28 | business_hours: 29 | +materialized: view 30 | 31 | seeds: 32 | business_hours: 33 | sample_tickets: 34 | +column_types: 35 | id: varchar 36 | user_id: varchar 37 | state: varchar 38 | subject: varchar 39 | conversation_created_at_business: timestamp_ntz 40 | first_response_at_business: timestamp_ntz 41 | first_closed_at_business: timestamp_ntz 42 | last_closed_at_business: timestamp_ntz -------------------------------------------------------------------------------- /business-hours/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/business-hours/macros/.gitkeep -------------------------------------------------------------------------------- /business-hours/macros/attempt-1-macros/business_time_functions.sql: -------------------------------------------------------------------------------- 1 | {# 2 | This file creates three macros - one calculates the number of weekdays between two dates, which may be useful for other projects 3 | 4 | The second uses the weekday macro to calculate the number of non-working hours between two dates. 5 | 6 | The third leverages the non_business_hours_between macro to generate the duration of business time between two dates 7 | 8 | #} 9 | 10 | {% macro weekdays_between(start_date, end_date) %} 11 | 12 | datediff('day', {{ start_date }}, {{ end_date }} ) - 13 | datediff('week', {{ start_date }}, dateadd('day', 1, {{ end_date }} )) - 14 | datediff('week', {{ start_date }}, {{ end_date }} ) 15 | 16 | {% endmacro %} 17 | 18 | 19 | {# non_business_hours_between: 20 | 21 | Terms in this macro: 22 | - weekdays_between: 23 | returns the number of weekdays between two dates. This is used to evaluate the number of overnights that occur between the two dates. 24 | i.e. Monday to Wednesday is 2 weekdays, ie two weeknights of non-business time. Friday to Monday evaluates to one overnight (8pm-12am Fri + 12am-8am Monday) 25 | we multiply by 12 to convert the weekdays between to hours. 26 | - evaluate weekends: 27 | in order to compare if a weekend falls in between two dates, we can compare the regular datediff to the weekday datediff. 28 | the difference is the number of weekend days (example, Friday to Monday, Datediff = 3, weekday = 1, 3-1 = 2) 29 | multiply the difference by 24 hours per weekend day 30 | 31 | #} 32 | 33 | {% macro non_business_hours_between(start_date, end_date) %} 34 | {% set non_working_hours = (24 - ( var("working_hour_end") - var("working_hour_start") )) %} 35 | 36 | coalesce( 37 | (( {{ weekdays_between(start_date, end_date) }} ) * {{ non_working_hours }} ) 38 | + ((datediff('day', {{ start_date }}, {{ end_date }} ) 39 | - ({{ weekdays_between(start_date, end_date) }}) 40 | ) * 24 )::int, 41 | 0 42 | ) 43 | 44 | {% endmacro %} 45 | 46 | 47 | {# 48 | 49 | business_minutes_between: 50 | This macro leverages the above macros to remove non-business time from the calculation of time durations.
51 | 52 | the basic structure here is: 53 | (date diff in minutes) - (non-business hours * 60) = business minutes 54 | 55 | #} 56 | 57 | 58 | {% macro business_minutes_between__1(start_date, end_date) %} 59 | 60 | datediff('minute', {{ start_date }}, {{ end_date }} ) 61 | - ( {{ non_business_hours_between( start_date, end_date ) }} * 60 ) 62 | 63 | {% endmacro %} 64 | -------------------------------------------------------------------------------- /business-hours/macros/attempt-2-subquery/business_time_functions.sql: -------------------------------------------------------------------------------- 1 | {# 2 | 3 | ### PURPOSE ### 4 | This macro calculates the total working minutes between two timestamps, 5 | meaning, if a ticket begins on a Friday and then carries into Monday, 6 | we do not want to count non-working hours (e.g. Saturdays) towards the 7 | total time to respond/close. 8 | 9 | ## MACROS + INPUTS ## 10 | * Macro 1: working_min_between 11 | This macro takes the two timestamps and finds the total number 12 | of working hours between the timestamps and multiplies it by 60 13 | to get the total working minutes between the two timestamps 14 | 15 | Example: 16 | ticket_id = '14025' 17 | first_message_at: 2021-07-09 11:29 18 | first_closed_at: 2021-07-12 14:46 19 | 20 | The total business hours between these two timestamps 21 | (2021-07-09 11:00:00 and 2021-07-12 14:00:00) is 14hr 22 | NOTE: We do not include the 11:00 and 14:00 hours in this 23 | because we will manually calculate the minutes from these 24 | hours in the next macro 25 | 26 | * Macro 2: business_minutes_between 27 | This macro will do two things: 28 | 1. If the working minutes between is 0min, then 29 | just datediff the start and end timestamps to 30 | find the minutes between. 31 | 2. If it's greater than 0, then we want to 32 | add the start_minutes and end_minutes to the 33 | hours between to get the total working minutes. 34 | See example below for a walk-through explanation 35 | Example: 36 | ticket_id = '14025' 37 | first_message_at: 2021-07-09 11:29 38 | first_closed_at: 2021-07-12 14:46 39 | The below macro will take three inputs into consideration: 40 | 1. The total working hours/minutes between the timestamps 41 | ^in this case -- 14 * 60 = 840min 42 | 2. The minutes from the start timestamp to the next hour 43 | ^in this case -- 2021-07-09 11:29 --> 31min 44 | 3.
The minutes from the end timestamp 45 | ^in this case, 46min 46 | We then add these together to see the total working business minutes 47 | between the two timestamps which is 917min 48 | #} 49 | {%- macro working_min_between(start_date, end_date) -%} 50 | ( select 51 | coalesce(count_if(is_business_hour),0) * 60 52 | from {{ ref('all_business_hours') }} 53 | where date_hour > date_trunc('hour', {{ start_date }}) 54 | and date_hour < date_trunc('hour', {{ end_date }}) 55 | ) 56 | {%- endmacro -%} 57 | 58 | {%- macro business_minutes_between__2(start_date, end_date) -%} 59 | coalesce( 60 | case 61 | -- take into account tickets opened and closed in same hour 62 | when (date_trunc('hour', {{ start_date }} ) = date_trunc('hour', {{ end_date }} )) 63 | then datediff('minute', {{ start_date }}, {{ end_date }}) 64 | else {{ working_min_between(start_date, end_date) }} 65 | + (60 - extract(minute from {{ start_date }})) 66 | + (extract(minute from {{ end_date }})) 67 | end, 68 | 0 69 | ) 70 | {%- endmacro -%} -------------------------------------------------------------------------------- /business-hours/models/all_business_hours.sql: -------------------------------------------------------------------------------- 1 | --use macro to create one row per hour per day 2 | with hours as ( 3 | 4 | {{ dbt_utils.date_spine( 5 | datepart="hour", 6 | start_date="to_date('01/01/2017', 'mm/dd/yyyy')", 7 | end_date="dateadd(month, 1, current_date)" 8 | ) 9 | }} 10 | 11 | ), 12 | 13 | /* -- if we had a seed for holidays, include it here 14 | 15 | holidays as ( 16 | 17 | select * from ref('stg_company_holidays') 18 | 19 | ), 20 | 21 | */ 22 | 23 | --convert hour to EST 24 | converted_hours as ( 25 | 26 | select distinct 27 | 28 | convert_timezone( 29 | 'UTC', 30 | date_hour 31 | )::timestamp_ntz as date_hour 32 | 33 | from hours 34 | ), 35 | 36 | 37 | --the output of this CTE is two columns: the first is one row for every hour of 38 | --the day date spine (from above). the second returns the same result if it falls 39 | --within our support hours. in the future, as support potentially changes, 40 | --this is where we will alter biz hours 41 | 42 | business_hours as ( 43 | 44 | select 45 | date_hour, 46 | 47 | case 48 | --before we hired a rep in MST (M-F, 8am - 8pm EST) 49 | when date_hour::date < '2021-09-14' 50 | and dayofweek(date_hour) not in (0,6) 51 | and hour(date_hour) between 8 and 19 52 | -- and holidays.date is null 53 | then converted_hours.date_hour 54 | 55 | -- after we hired international reps (covering Sunday 7pm to Friday 5pm) 56 | when date_hour::date >= '2021-09-14' 57 | and dayofweek(date_hour) = 0 --sundays after 7pm is fair game 58 | and hour(date_hour) between 19 and 23 59 | -- and holidays.date is null 60 | then converted_hours.date_hour 61 | 62 | when date_hour::date >= '2021-09-01' 63 | and dayofweek(date_hour) between 1 and 4 --24/hr coverage M-Thurs 64 | -- and holidays.date is null 65 | then converted_hours.date_hour 66 | 67 | when date_hour::date >= '2021-09-01' 68 | and dayofweek(date_hour) = 5 --fridays, we rest after 8pm ET 69 | and hour(date_hour) between 0 and 19 70 | -- and holidays.date is null 71 | then converted_hours.date_hour 72 | 73 | end as business_hour 74 | 75 | from converted_hours 76 | -- left join holidays 77 | -- on date_trunc(day, date_hour)::date = holidays.date 78 | 79 | ), 80 | 81 | --the output of this CTE adds an additional column to fill in missing values 82 | --the purpose is to show 8am for times outside of business hours 83 | --ex. 
10:36pm at night will show 8:00am the next day 84 | corrections as ( 85 | 86 | select 87 | 88 | *, 89 | business_hour is not null as is_business_hour, 90 | lead(business_hour) ignore nulls over ( 91 | partition by 1 92 | order by date_hour 93 | ) as adjusted_business_hour 94 | 95 | from business_hours 96 | 97 | ), 98 | 99 | --this cleans up the extra columns to properly calculate business hours. 100 | --the result is one row for every hour of every day with a mapped business hour. 101 | final as ( 102 | 103 | select 104 | 105 | date_hour, 106 | coalesce(business_hour, adjusted_business_hour) as business_hour, 107 | is_business_hour 108 | 109 | from corrections 110 | 111 | ) 112 | 113 | select * from final -------------------------------------------------------------------------------- /business-hours/models/fct_support_tickets.sql: -------------------------------------------------------------------------------- 1 | with 2 | 3 | tickets as ( 4 | select * from {{ ref('sample_tickets') }} 5 | ), 6 | 7 | final as ( 8 | 9 | select 10 | tickets.*, 11 | 12 | -- macros for attempt 1 -- nested macros 13 | 14 | -- measure weekdays between dates 15 | {{ weekdays_between('conversation_created_at_business', 'first_response_at_business') }} as weekdays_to_first_response, 16 | -- add up overnights + weekends 17 | {{ non_business_hours_between('conversation_created_at_business', 'first_response_at_business') }} as non_working_hours, 18 | -- final calculation 19 | {{ business_minutes_between__1('conversation_created_at_business', 'first_response_at_business') }} as business_minutes__1, 20 | 21 | -- macros for attempt 2 -- subquery 22 | 23 | -- subquery to get working hours 24 | {{ working_min_between('conversation_created_at_business', 'first_response_at_business') }} as working_min_subquery, 25 | 26 | -- adjustments + final calculation 27 | {{ business_minutes_between__2('conversation_created_at_business', 'first_response_at_business') }} as business_minutes__2 28 | 29 | 30 | 31 | from tickets 32 | 33 | ) 34 | 35 | select * from final -------------------------------------------------------------------------------- /business-hours/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_utils 3 | version: 0.8.0 -------------------------------------------------------------------------------- /business-hours/seeds/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/business-hours/seeds/.gitkeep -------------------------------------------------------------------------------- /business-hours/seeds/sample_tickets.csv: -------------------------------------------------------------------------------- 1 | id,user_id,state,subject,conversation_created_at_business,first_response_at_business,first_closed_at_business,last_closed_at_business 2 | 263c7e9022713fa39b62ac583fd26701,1234,closed,Porridge too hot,2021-03-24 19:29:23.000,2021-03-25 08:29:20.000,2021-01-28 08:00:00.000,2021-01-28 14:22:48.000 3 | af23d77bb2d304f5649da474b81fedeb,1234,closed,Porridge too cold,2021-01-04 10:01:34.000,2021-01-05 09:54:29.000,2021-01-07 15:01:43.000,2021-01-07 15:01:43.000 4 | e8b40b51ae64949c2f3bac987840f042,1234,closed,Porridge just right!,2021-01-07 19:54:44.000,2021-01-08 08:58:34.000,2021-01-13 15:14:04.000,2021-01-13 15:14:04.000 5 | 793a0185de6dd851e395743c8c84b939,1234,closed,Bed too 
firm,2021-01-04 08:00:00.000,2021-01-05 09:53:02.000,2021-01-07 15:01:57.000,2021-01-07 15:01:57.000 6 | f2721518d728f1a359517d1e1d432424,1234,closed,Bed too soft,2021-02-01 08:00:00.000,2021-02-02 08:18:24.000,2021-02-01 08:20:18.000,2021-02-02 08:18:36.000 7 | 8995d903fb57323dbc6821a0aee9cfa4,1234,closed,Bed just right! Can I have it?,2021-01-20 13:03:09.000,2021-01-20 13:29:37.000,2021-01-20 15:00:57.000,2021-03-17 12:44:53.000 8 | aa0dfd6e6b3722d485f29cb09e1a2683,1234,closed,Chair is too big,2021-01-28 17:13:50.000,2021-01-28 17:27:10.000,2021-01-29 11:45:36.000,2021-01-29 11:45:36.000 9 | ee02aaa6e76c087e34211df44b694123,1234,closed,Chair is too small,2021-03-17 09:44:29.000,2021-03-17 09:51:07.000,2021-03-18 16:32:07.000,2021-03-18 16:32:07.000 10 | fc7ee5d4cbb678b7f85de637ff20496d,1234,closed,Chair is just right,2021-03-12 14:34:40.000,2021-03-12 14:57:09.000,2021-03-15 16:03:44.000,2021-03-15 16:03:44.000 11 | 618e94c101c6b0ffd3b0ac887dc9e30b,5432,open,Someone was in my home,2021-01-27 19:50:06.000,2021-01-27 19:51:38.000,, -------------------------------------------------------------------------------- /business-hours/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/business-hours/snapshots/.gitkeep -------------------------------------------------------------------------------- /business-hours/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/business-hours/tests/.gitkeep -------------------------------------------------------------------------------- /dynamic-data-masking-redshift/README.md: -------------------------------------------------------------------------------- 1 | # dynamic-data-masking-redshift 2 | Check out [this discourse article](https://discourse.getdbt.com/t/how-to-implement-dynamic-data-masking-on-redshift/2043) -------------------------------------------------------------------------------- /dynamic-data-masking-redshift/data/employees.csv: -------------------------------------------------------------------------------- 1 | id,first_name,last_name,favorite_bagel_flavor 2 | 1,Tristan,Handy,everything 3 | 2,Drew,Banin,poppy seed 4 | 3,Connor,McArthur,sesame 5 | -------------------------------------------------------------------------------- /dynamic-data-masking-redshift/dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: 'bagel_shop' 2 | version: '0.1.0' 3 | 4 | config-version: 2 5 | profile: bagel_shop 6 | 7 | source-paths: ["models"] 8 | analysis-paths: ["analysis"] 9 | test-paths: ["tests"] 10 | data-paths: ["data"] 11 | macro-paths: ["macros"] 12 | snapshot-paths: ["snapshots"] 13 | 14 | target-path: "target" 15 | clean-targets: 16 | - "target" 17 | - "dbt_modules" 18 | 19 | models: 20 | -------------------------------------------------------------------------------- /dynamic-data-masking-redshift/macros/apply_data_masking.sql: -------------------------------------------------------------------------------- 1 | {% macro mask_column(column_name) %} 2 | -- logic that controls the masking 3 | case 4 | when current_user in ('claire') then {{ column_name }} 5 | else md5({{ column_name }}) 6 | end 7 | {% endmacro %} 8 | 9 | {% macro create_data_masked_view(schema, columns_to_mask) %} 
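{#
    This macro is designed to be called from a model post-hook (see
    employees_with_masking.sql). At execution time it looks up the columns of
    {{ this }}, builds a Relation for a view with the same name in the target
    schema, creates that schema if needed, and then drops and recreates the view,
    selecting every column and wrapping the columns listed in `columns_to_mask`
    with mask_column() so that unauthorized users only see hashed values. The
    trailing `select 1=1` is presumably there so that the rendered hook is still
    a valid SQL statement once the view has been created via run_query().
#}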
10 | {% if execute %} 11 | 12 | {# get all columns in the relation #} 13 | 14 | {% set model_cols = adapter.get_columns_in_relation(this) %} 15 | 16 | {# create Relation object for masked view #} 17 | 18 | {%- set masked_view = api.Relation.create( 19 | database=this.database, 20 | schema=schema, 21 | identifier=this.identifier) -%} 22 | 23 | {# create schema #} 24 | 25 | {% do adapter.create_schema(masked_view) %} 26 | 27 | {# create masked view in new schema for sensitive columns #} 28 | 29 | {% set view_sql %} 30 | 31 | drop view if exists {{ masked_view }}; 32 | 33 | create view {{ masked_view }} as ( 34 | 35 | select 36 | {% for col in model_cols %} 37 | {% if col.name in columns_to_mask %} 38 | {{ mask_column(col.name) }} as {{ col.name }} 39 | {% else %} 40 | {{ col.name }} 41 | {% endif %} 42 | {{ "," if not loop.last }} 43 | {% endfor %} 44 | from {{ this }} 45 | ) 46 | 47 | {% endset %} 48 | 49 | {% do run_query(view_sql) %} 50 | 51 | {% do dbt_utils.log_info("Masked view created at: " ~ masked_view ) %} 52 | 53 | {% endif %} 54 | 55 | select 1=1 56 | 57 | {% endmacro %} -------------------------------------------------------------------------------- /dynamic-data-masking-redshift/models/employees_with_masking.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | post_hook="{{ create_data_masked_view( 4 | schema='public_analytics', 5 | columns_to_mask=['first_name', 'last_name'] 6 | ) }}" 7 | ) 8 | }} 9 | 10 | select 11 | -- this is the model sql 12 | id, 13 | first_name, 14 | last_name, 15 | favorite_bagel_flavor 16 | from {{ ref('employees') }} 17 | -------------------------------------------------------------------------------- /dynamic-data-masking-redshift/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: fishtown-analytics/dbt_utils 3 | version: 0.6.3 -------------------------------------------------------------------------------- /insert_by_period/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_packages/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /insert_by_period/Makefile: -------------------------------------------------------------------------------- 1 | test-databricks: 2 | dbt deps 3 | dbt seed --target databricks --full-refresh 4 | dbt run -s test_insert_by_period --target databricks --full-refresh 5 | dbt test 6 | dbt run -s test_insert_by_period --target databricks --vars 'test_backfill: True' 7 | dbt test --target databricks 8 | 9 | test-all: test-databricks 10 | echo "Completed successfully" -------------------------------------------------------------------------------- /insert_by_period/README.md: -------------------------------------------------------------------------------- 1 | # Custom insert by period materialization 2 | 3 | `insert_by_period` allows dbt to insert records into a table one period (i.e. day, week) at a time. 4 | 5 | This materialization is appropriate for event data that can be processed in discrete periods. It is similar in concept to the built-in incremental materialization, but has the added benefit of building the model in chunks even during a full-refresh so is particularly useful for models where the initial run can be problematic. 
6 | 7 | Should a run of a model using this materialization be interrupted, a subsequent run will continue building the target table from where it was interrupted (granted the `--full-refresh` flag is omitted). 8 | 9 | Progress is logged in the command line for easy monitoring. 10 | 11 | ## Installation 12 | This is not a package on the Package Hub. To install it via git, add this to `packages.yml`: 13 | ```yaml 14 | packages: 15 | - git: https://github.com/dbt-labs/dbt-labs-experimental-features 16 | subdirectory: insert_by_period 17 | revision: XXXX #optional but highly recommended. Provide a full git sha hash, e.g. 7180db61d26836b931aa6ef8ad9d70e7fb3a69fa. If not provided, uses the current HEAD. 18 | 19 | ``` 20 | 21 | ## Usage: 22 | 23 | ```sql 24 | {{ 25 | config( 26 | materialized = "insert_by_period", 27 | period = "day", 28 | timestamp_field = "created_at", 29 | start_date = "2018-01-01", 30 | stop_date = "2018-06-01") 31 | }} 32 | with events as ( 33 | select * 34 | from {{ ref('events') }} 35 | where __PERIOD_FILTER__ -- This will be replaced with a filter in the materialization code 36 | ) 37 | ....complex aggregates here.... 38 | ``` 39 | 40 | **Configuration values:** 41 | 42 | - `period`: period to break the model into, must be a valid [datepart](https://docs.aws.amazon.com/redshift/latest/dg/r_Dateparts_for_datetime_functions.html) (default='Week') 43 | - `timestamp_field`: the column name of the timestamp field that will be used to break the model into smaller queries 44 | - `start_date`: literal date or timestamp - generally choose a date that is earlier than the start of your data 45 | - `stop_date`: literal date or timestamp (default=current_timestamp) 46 | 47 | **Caveats:** 48 | 49 | - This materialization is compatible and tested for a subset of adapters for now: BigQuery, Databricks, PostgreSQL, Redshift and Snowflake. 50 | - This materialization can only be used for a model where records are not expected to change after they are created. 51 | - Any model post-hooks that use `{{ this }}` will fail using this materialization. For example: 52 | 53 | ```yaml 54 | models: 55 | project-name: 56 | post-hook: "grant select on {{ this }} to db_reader" 57 | ``` 58 | 59 | A useful workaround is to change the above post-hook to: 60 | 61 | ```yaml 62 | post-hook: "grant select on {{ this.schema }}.{{ this.name }} to db_reader" 63 | ``` 64 | -------------------------------------------------------------------------------- /insert_by_period/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | # Name your project! Project names should contain only lowercase characters 3 | # and underscores. A good package name should reflect your organization's 4 | # name or the intended use of these models 5 | name: 'insert_by_period' 6 | version: '1.0.0' 7 | config-version: 2 8 | 9 | # This package requires dbt version 1.3.0 or higher to be able to resolve 10 | # dbt.current_timestamp() 11 | require-dbt-version: [">=1.3.0", "<2.0.0"] 12 | 13 | # This setting configures which "profile" dbt uses for this project. 14 | profile: 'insert_by_period' 15 | 16 | dispatch: 17 | - macro_namespace: dbt_utils 18 | search_order: ['spark_utils', 'dbt_utils'] 19 | 20 | # These configurations specify where dbt should look for different types of files. 21 | # The `model-paths` config, for example, states that models in this project can be 22 | # found in the "models/" directory. You probably won't need to change these! 
23 | model-paths: ["models"] 24 | analysis-paths: ["analyses"] 25 | test-paths: ["tests"] 26 | seed-paths: ["seeds"] 27 | macro-paths: ["macros"] 28 | snapshot-paths: ["snapshots"] 29 | 30 | target-path: "target" # directory which will store compiled SQL files 31 | clean-targets: # directories to be removed by `dbt clean` 32 | - "target" 33 | - "dbt_packages" 34 | -------------------------------------------------------------------------------- /insert_by_period/integration_tests/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_modules/ 4 | logs/ 5 | .env/ 6 | profiles.yml 7 | package-lock.yml -------------------------------------------------------------------------------- /insert_by_period/integration_tests/ci/sample.profiles.yml: -------------------------------------------------------------------------------- 1 | 2 | # HEY! This file is used in the dbt-utils integrations tests with CircleCI. 3 | # You should __NEVER__ check credentials into version control. Thanks for reading :) 4 | 5 | config: 6 | send_anonymous_usage_stats: False 7 | use_colors: True 8 | 9 | integration_tests: 10 | target: postgres 11 | outputs: 12 | postgres: 13 | type: postgres 14 | host: "{{ env_var('POSTGRES_TEST_HOST') }}" 15 | user: "{{ env_var('POSTGRES_TEST_USER') }}" 16 | pass: "{{ env_var('POSTGRES_TEST_PASS') }}" 17 | port: "{{ env_var('POSTGRES_TEST_PORT') | as_number }}" 18 | dbname: "{{ env_var('POSTGRES_TEST_DBNAME') }}" 19 | schema: dbt_utils_integration_tests_postgres 20 | threads: 5 21 | 22 | redshift: 23 | type: redshift 24 | host: "{{ env_var('REDSHIFT_TEST_HOST') }}" 25 | user: "{{ env_var('REDSHIFT_TEST_USER') }}" 26 | pass: "{{ env_var('REDSHIFT_TEST_PASS') }}" 27 | dbname: "{{ env_var('REDSHIFT_TEST_DBNAME') }}" 28 | port: "{{ env_var('REDSHIFT_TEST_PORT') | as_number }}" 29 | schema: dbt_utils_integration_tests_redshift 30 | threads: 5 31 | 32 | bigquery: 33 | type: bigquery 34 | method: service-account 35 | keyfile: "{{ env_var('BIGQUERY_SERVICE_KEY_PATH') }}" 36 | project: "{{ env_var('BIGQUERY_TEST_DATABASE') }}" 37 | schema: dbt_utils_integration_tests_bigquery 38 | threads: 10 39 | 40 | snowflake: 41 | type: snowflake 42 | account: "{{ env_var('SNOWFLAKE_TEST_ACCOUNT') }}" 43 | user: "{{ env_var('SNOWFLAKE_TEST_USER') }}" 44 | password: "{{ env_var('SNOWFLAKE_TEST_PASSWORD') }}" 45 | role: "{{ env_var('SNOWFLAKE_TEST_ROLE') }}" 46 | database: "{{ env_var('SNOWFLAKE_TEST_DATABASE') }}" 47 | warehouse: "{{ env_var('SNOWFLAKE_TEST_WAREHOUSE') }}" 48 | schema: dbt_utils_integration_tests_snowflake 49 | threads: 10 50 | 51 | databricks: 52 | type: databricks 53 | catalog: "{{ env_var('DATABRICKS_TEST_CATALOG') }}" 54 | host: "{{ env_var('DATABRICKS_TEST_HOST') }}" 55 | http_path: "{{ env_var('DATABRICKS_TEST_HTTP_PATH') }}" 56 | token: "{{ env_var('DATABRICKS_TEST_TOKEN') }}" 57 | schema: dbt_utils_integration_tests_databricks 58 | threads: 4 59 | -------------------------------------------------------------------------------- /insert_by_period/integration_tests/data/data_insert_by_period.csv: -------------------------------------------------------------------------------- 1 | id,created_at 2 | 1,2017-12-02 3 | 2,2018-01-02 4 | 3,2018-02-02 5 | 4,2018-03-02 6 | 5,2018-04-02 7 | 6,2018-05-02 8 | 7,2018-06-02 9 | 8,2018-07-02 10 | 9,2018-08-02 -------------------------------------------------------------------------------- /insert_by_period/integration_tests/data/data_insert_by_period_overwrite.csv: 
-------------------------------------------------------------------------------- 1 | id,created_at 2 | 3,2018-02-02 3 | 6,2018-05-02 4 | 5,2018-04-02 5 | 2,2018-01-02 6 | 6,2018-05-02 7 | 4,2018-03-02 8 | 3,2018-02-02 9 | 4,2018-03-02 10 | 5,2018-04-02 11 | 2,2018-01-02 12 | -------------------------------------------------------------------------------- /insert_by_period/integration_tests/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | name: 'insert_by_period_integration_tests' 3 | version: '1.0' 4 | 5 | # require-dbt-version: inherit this from dbt-utils 6 | 7 | config-version: 2 8 | 9 | # This setting configures which "profile" dbt uses for this project. 10 | profile: 'integration_tests' 11 | 12 | dispatch: 13 | - macro_namespace: dbt_utils 14 | search_order: ['spark_utils', 'dbt_utils', 'insert_by_period_integration_tests'] 15 | 16 | 17 | model-paths: ["models"] 18 | analysis-paths: ["analysis"] 19 | test-paths: ["tests"] 20 | seed-paths: ["data"] 21 | macro-paths: ["macros"] 22 | 23 | target-path: "target" # directory which will store compiled SQL files 24 | clean-targets: # directories to be removed by `dbt clean` 25 | - "target" 26 | - "dbt_modules" 27 | - "dbt_packages" 28 | 29 | seeds: 30 | +quote_columns: false -------------------------------------------------------------------------------- /insert_by_period/integration_tests/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/insert_by_period/integration_tests/macros/.gitkeep -------------------------------------------------------------------------------- /insert_by_period/integration_tests/macros/assert_equal_values.sql: -------------------------------------------------------------------------------- 1 | {% macro assert_equal_values(actual_object, expected_object) %} 2 | {% if not execute %} 3 | 4 | {# pass #} 5 | 6 | {% elif actual_object != expected_object %} 7 | 8 | {% set msg %} 9 | Expected did not match actual 10 | 11 | ----------- 12 | Actual: 13 | ----------- 14 | --->{{ actual_object }}<--- 15 | 16 | ----------- 17 | Expected: 18 | ----------- 19 | --->{{ expected_object }}<--- 20 | 21 | {% endset %} 22 | 23 | {{ log(msg, info=True) }} 24 | 25 | select 'fail' 26 | 27 | {% else %} 28 | 29 | select 'ok' {{ limit_zero() }} 30 | 31 | {% endif %} 32 | {% endmacro %} -------------------------------------------------------------------------------- /insert_by_period/integration_tests/macros/limit_zero.sql: -------------------------------------------------------------------------------- 1 | {% macro my_custom_macro() %} 2 | whatever 3 | {% endmacro %} 4 | 5 | {% macro limit_zero() %} 6 | {{ return(adapter.dispatch('limit_zero', 'dbt_utils')()) }} 7 | {% endmacro %} 8 | 9 | {% macro default__limit_zero() %} 10 | {{ return('limit 0') }} 11 | {% endmacro %} -------------------------------------------------------------------------------- /insert_by_period/integration_tests/macros/tests.sql: -------------------------------------------------------------------------------- 1 | 2 | {% test assert_equal(model, actual, expected) %} 3 | select * from {{ model }} where {{ actual }} != {{ expected }} 4 | 5 | {% endtest %} 6 | 7 | 8 | {% test not_empty_string(model, column_name) %} 9 | 10 | select * from {{ model }} where {{ column_name }} = '' 11 | 12 | {% endtest %} 13 | 
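{#
    Illustrative usage of the generic tests defined above. This is only a sketch:
    the model and column names below are hypothetical and do not appear in this
    project's schema.yml.

    models:
      - name: some_model
        tests:
          - assert_equal:
              actual: actual_column
              expected: expected_column
        columns:
          - name: some_string_column
            tests:
              - not_empty_string
#}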
-------------------------------------------------------------------------------- /insert_by_period/integration_tests/models/expected_insert_by_period.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'view', 4 | enabled=(project_name == 'insert_by_period_integration_tests'), 5 | ) 6 | }} 7 | 8 | select * 9 | from {{ ref('data_insert_by_period') }} 10 | where id in (2, 3, 4, 5, 6) 11 | -------------------------------------------------------------------------------- /insert_by_period/integration_tests/models/expected_insert_by_period_overwrite.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'view', 4 | enabled=(project_name == 'insert_by_period_integration_tests'), 5 | ) 6 | }} 7 | 8 | select * 9 | from {{ ref('data_insert_by_period_overwrite') }} 10 | where id in (2, 3, 4, 5, 6) 11 | -------------------------------------------------------------------------------- /insert_by_period/integration_tests/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: test_insert_by_period 5 | tests: 6 | - dbt_utils.equality: 7 | compare_model: ref('expected_insert_by_period') 8 | enabled: "{{not var('test_backfill', False)}}" 9 | - dbt_utils.equality: 10 | compare_model: ref('expected_insert_by_period_overwrite') 11 | enabled: "{{var('test_backfill', False)}}" 12 | -------------------------------------------------------------------------------- /insert_by_period/integration_tests/models/test_insert_by_period.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'insert_by_period', 4 | period = 'month', 5 | timestamp_field = 'cast(created_at as timestamp)', 6 | start_date = '2018-01-01', 7 | stop_date = '2018-03-01', 8 | backfill = var('test_backfill', False), 9 | enabled=(project_name == 'insert_by_period_integration_tests'), 10 | ) 11 | }} 12 | 13 | with events as ( 14 | select * 15 | from {{ ref('data_insert_by_period') }} 16 | where __PERIOD_FILTER__ 17 | ) 18 | 19 | select * from events 20 | -------------------------------------------------------------------------------- /insert_by_period/integration_tests/packages.yml: -------------------------------------------------------------------------------- 1 | 2 | packages: 3 | - local: ../ 4 | - package: dbt-labs/dbt_utils 5 | version: [">0.9.0", "<2.0.0"] 6 | - package: dbt-labs/spark_utils 7 | version: 0.3.0 8 | -------------------------------------------------------------------------------- /insert_by_period/macros/create_relation_for_insert_by_period.sql: -------------------------------------------------------------------------------- 1 | {% macro create_relation_for_insert_by_period(tmp_identifier, schema, type) -%} 2 | {{ return(adapter.dispatch('create_relation_for_insert_by_period', 'insert_by_period')(tmp_identifier, schema, type)) }} 3 | {% endmacro %} 4 | 5 | {% macro default__create_relation_for_insert_by_period(tmp_identifier, schema, type) -%} 6 | {% do return (api.Relation.create(identifier=tmp_identifier, 7 | schema=schema, type=type)) %} 8 | {%- endmacro %} 9 | 10 | {% macro postgres__create_relation_for_insert_by_period(tmp_identifier, schema, type) -%} 11 | {% do return (api.Relation.create(identifier=tmp_identifier, 12 | schema=None, type=type)) %} 13 | {%- endmacro %} 14 | 15 | {% macro 
databricks__create_relation_for_insert_by_period(tmp_identifier, schema, type) -%} 16 | {% do return (api.Relation.create(identifier=tmp_identifier, 17 | schema=None, type=type)) %} 18 | {%- endmacro %} 19 | -------------------------------------------------------------------------------- /insert_by_period/macros/get_period_boundaries.sql: -------------------------------------------------------------------------------- 1 | {% macro get_period_boundaries(target_schema, target_table, timestamp_field, start_date, stop_date, period, backfill, full_refresh_mode) -%} 2 | {{ return(adapter.dispatch('get_period_boundaries', 'insert_by_period')(target_schema, target_table, timestamp_field, start_date, stop_date, period, backfill, full_refresh_mode)) }} 3 | {% endmacro %} 4 | 5 | {% macro default__get_period_boundaries(target_schema, target_table, timestamp_field, start_date, stop_date, period, backfill, full_refresh_mode) -%} 6 | 7 | {% call statement('period_boundaries', fetch_result=True) -%} 8 | with data as ( 9 | select 10 | {% if backfill and not full_refresh_mode -%} 11 | cast('{{start_date}}' as timestamp) as start_timestamp, 12 | {%- else -%} 13 | coalesce(max({{timestamp_field}}), cast('{{start_date}}' as timestamp)) as start_timestamp, 14 | {%- endif %} 15 | coalesce( 16 | {{ dateadd('millisecond', 17 | -1, 18 | "cast(nullif('" ~ stop_date ~ "','') as timestamp)") }}, 19 | {{ dbt.current_timestamp() }} 20 | ) as stop_timestamp 21 | from {{adapter.quote(target_schema)}}.{{adapter.quote(target_table)}} 22 | ) 23 | 24 | select 25 | start_timestamp, 26 | stop_timestamp, 27 | {{ datediff('start_timestamp', 28 | 'stop_timestamp', 29 | period) }} + 1 as num_periods 30 | from data 31 | {%- endcall %} 32 | 33 | {%- endmacro %} 34 | 35 | 36 | {% macro bigquery__get_period_boundaries(target_schema, target_table, timestamp_field, start_date, stop_date, period, backfill, full_refresh_mode) -%} 37 | 38 | {% call statement('period_boundaries', fetch_result=True) -%} 39 | with data as ( 40 | select 41 | {% if backfill and not full_refresh_mode -%} 42 | cast('{{start_date}}' as timestamp) as start_timestamp, 43 | {%- else -%} 44 | coalesce(max({{timestamp_field}}), cast('{{start_date}}' as timestamp)) as start_timestamp, 45 | {%- endif %} 46 | coalesce(datetime_add(cast(nullif('{{stop_date}}','') as timestamp), interval -1 millisecond), {{dbt.current_timestamp()}}) as stop_timestamp 47 | from {{adapter.quote(target_schema)}}.{{adapter.quote(target_table)}} 48 | ) 49 | 50 | select 51 | start_timestamp, 52 | stop_timestamp, 53 | {{ datediff('start_timestamp', 54 | 'stop_timestamp', 55 | period) }} + 1 as num_periods 56 | from data 57 | {%- endcall %} 58 | 59 | {%- endmacro %} -------------------------------------------------------------------------------- /insert_by_period/macros/get_period_sql.sql: -------------------------------------------------------------------------------- 1 | {% macro get_period_sql(target_cols_csv, sql, timestamp_field, period, start_timestamp, stop_timestamp, offset) -%} 2 | {{ return(adapter.dispatch('get_period_sql', 'insert_by_period')(target_cols_csv, sql, timestamp_field, period, start_timestamp, stop_timestamp, offset)) }} 3 | {% endmacro %} 4 | 5 | {% macro default__get_period_sql(target_cols_csv, sql, timestamp_field, period, start_timestamp, stop_timestamp, offset) -%} 6 | 7 | {%- set period_filter -%} 8 | ({{timestamp_field}} > '{{start_timestamp}}'::timestamp + interval '{{offset}} {{period}}' and 9 | {{timestamp_field}} <= '{{start_timestamp}}'::timestamp + interval 
'{{offset}} {{period}}' + interval '1 {{period}}' and 10 | {{timestamp_field}} < '{{stop_timestamp}}'::timestamp) 11 | {%- endset -%} 12 | 13 | {%- set filtered_sql = sql | replace("__PERIOD_FILTER__", period_filter) -%} 14 | 15 | select 16 | {{target_cols_csv}} 17 | from ( 18 | {{filtered_sql}} 19 | ) target_cols 20 | 21 | {%- endmacro %} 22 | 23 | 24 | {% macro bigquery__get_period_sql(target_cols_csv, sql, timestamp_field, period, start_timestamp, stop_timestamp, offset) -%} 25 | 26 | {%- set period_filter -%} 27 | ({{timestamp_field}} > cast(cast(timestamp('{{start_timestamp}}') as datetime) + interval {{offset}} {{period}} as timestamp) and 28 | {{timestamp_field}} <= cast(cast(timestamp('{{start_timestamp}}') as datetime) + interval {{offset}} {{period}} + interval 1 {{period}} as timestamp) and 29 | {{timestamp_field}} < cast('{{stop_timestamp}}' as timestamp)) 30 | {%- endset -%} 31 | 32 | {%- set filtered_sql = sql | replace("__PERIOD_FILTER__", period_filter) -%} 33 | 34 | select 35 | {{target_cols_csv}} 36 | from ( 37 | {{filtered_sql}} 38 | ) target_cols 39 | 40 | {%- endmacro %} -------------------------------------------------------------------------------- /insert_by_period/macros/get_rows_inserted.sql: -------------------------------------------------------------------------------- 1 | {% macro get_rows_inserted(result) -%} 2 | {{ return(adapter.dispatch('get_rows_inserted', 'insert_by_period')(result)) }} 3 | {% endmacro %} 4 | 5 | {% macro default__get_rows_inserted(result) %} 6 | 7 | {% if 'response' in result.keys() %} {# added in v0.19.0 #} 8 | {% set rows_inserted = result['response']['rows_affected'] %} 9 | {% else %} {# older versions #} 10 | {% set rows_inserted = result['status'].split(" ")[2] | int %} 11 | {% endif %} 12 | 13 | {{return(rows_inserted)}} 14 | 15 | {% endmacro %} 16 | 17 | {% macro databricks__get_rows_inserted(result) %} 18 | 19 | {% if 'data' in result.keys() %} 20 | {% set rows_inserted = result['data'][0][0] | int %} 21 | {% endif %} 22 | 23 | {{return(rows_inserted)}} 24 | 25 | {% endmacro %} -------------------------------------------------------------------------------- /insert_by_period/macros/insert_by_period_materialization.sql: -------------------------------------------------------------------------------- 1 | {% materialization insert_by_period, default -%} 2 | {%- set timestamp_field = config.require('timestamp_field') -%} 3 | {%- set start_date = config.require('start_date') -%} 4 | {%- set stop_date = config.get('stop_date') or '' -%} 5 | {%- set period = config.get('period') or 'week' -%} 6 | {%- set backfill = config.get('backfill') or False -%} 7 | 8 | {%- if sql.find('__PERIOD_FILTER__') == -1 -%} 9 | {%- set error_message -%} 10 | Model '{{ model.unique_id }}' does not include the required string '__PERIOD_FILTER__' in its sql 11 | {%- endset -%} 12 | {{ exceptions.raise_compiler_error(error_message) }} 13 | {%- endif -%} 14 | 15 | {%- set identifier = model['name'] -%} 16 | 17 | {%- set old_relation = adapter.get_relation(database=database, schema=schema, identifier=identifier) -%} 18 | {%- set target_relation = api.Relation.create(identifier=identifier, schema=schema, type='table') -%} 19 | 20 | {%- set non_destructive_mode = (flags.NON_DESTRUCTIVE == True) -%} 21 | {%- set full_refresh_mode = (flags.FULL_REFRESH == True) -%} 22 | 23 | {%- set exists_as_table = (old_relation is not none and old_relation.is_table) -%} 24 | {%- set exists_not_as_table = (old_relation is not none and not old_relation.is_table) -%} 25 | 26 | {%- 
set should_truncate = (non_destructive_mode and full_refresh_mode and exists_as_table) -%} 27 | {%- set should_drop = (not should_truncate and (full_refresh_mode or exists_not_as_table)) -%} 28 | {%- set force_create = (flags.FULL_REFRESH and not flags.NON_DESTRUCTIVE) -%} 29 | 30 | -- setup 31 | {% if old_relation is none -%} 32 | -- noop 33 | {%- elif should_truncate -%} 34 | {{adapter.truncate_relation(old_relation)}} 35 | {%- elif should_drop -%} 36 | {{adapter.drop_relation(old_relation)}} 37 | {%- set old_relation = none -%} 38 | {%- endif %} 39 | 40 | {{ run_hooks(pre_hooks, inside_transaction=False) }} 41 | 42 | -- `BEGIN` happens here: 43 | {{ run_hooks(pre_hooks, inside_transaction=True) }} 44 | 45 | -- build model 46 | {% if force_create or old_relation is none -%} 47 | {# Create an empty target table -#} 48 | {% call statement('main') -%} 49 | {%- set empty_sql = sql | replace("__PERIOD_FILTER__", 'false') -%} 50 | {{create_table_as(False, target_relation, empty_sql)}} 51 | {%- endcall %} 52 | {%- endif %} 53 | 54 | {% set period_boundaries = insert_by_period.get_period_boundaries( 55 | schema, 56 | identifier, 57 | timestamp_field, 58 | start_date, 59 | stop_date, 60 | period, 61 | backfill, 62 | full_refresh_mode, 63 | ) %} 64 | {% set period_boundaries_results = load_result('period_boundaries')['data'][0] %} 65 | {%- set start_timestamp = period_boundaries_results[0] | string -%} 66 | {%- set stop_timestamp = period_boundaries_results[1] | string -%} 67 | {%- set num_periods = period_boundaries_results[2] | int -%} 68 | 69 | {% set target_columns = adapter.get_columns_in_relation(target_relation) %} 70 | {%- set target_cols_csv = target_columns | map(attribute='quoted') | join(', ') -%} 71 | {%- set loop_vars = {'sum_rows_inserted': 0} -%} 72 | 73 | -- commit each period as a separate transaction 74 | {% for i in range(num_periods) -%} 75 | {%- set msg = "Running for " ~ period ~ " " ~ (i + 1) ~ " of " ~ (num_periods) -%} 76 | {{ print(msg) }} 77 | 78 | {%- set tmp_identifier = model['name'] ~ '__dbt_incremental_period' ~ i ~ '_tmp' -%} 79 | {%- set tmp_relation = insert_by_period.create_relation_for_insert_by_period(tmp_identifier, schema, 'table') -%} 80 | {% call statement() -%} 81 | {% set tmp_table_sql = insert_by_period.get_period_sql(target_cols_csv, 82 | sql, 83 | timestamp_field, 84 | period, 85 | start_timestamp, 86 | stop_timestamp, 87 | i) %} 88 | {{dbt.create_table_as(True, tmp_relation, tmp_table_sql)}} 89 | {%- endcall %} 90 | 91 | {{adapter.expand_target_column_types(from_relation=tmp_relation, 92 | to_relation=target_relation)}} 93 | {%- set name = 'main-' ~ i -%} 94 | {% call statement(name, fetch_result=True) -%} 95 | insert into {{target_relation}} ({{target_cols_csv}}) 96 | ( 97 | select 98 | {{target_cols_csv}} 99 | from {{tmp_relation.include(schema=True)}} 100 | ); 101 | {%- endcall %} 102 | {% set result = load_result('main-' ~ i) %} 103 | 104 | {% set rows_inserted = insert_by_period.get_rows_inserted(result) %} 105 | 106 | {%- set sum_rows_inserted = loop_vars['sum_rows_inserted'] + rows_inserted -%} 107 | {%- if loop_vars.update({'sum_rows_inserted': sum_rows_inserted}) %} {% endif -%} 108 | 109 | {%- set msg = "Ran for " ~ period ~ " " ~ (i + 1) ~ " of " ~ (num_periods) ~ "; " ~ rows_inserted ~ " record(s) inserted" -%} 110 | {{ print(msg) }} 111 | 112 | {%- endfor %} 113 | 114 | -- from the table mat 115 | {% do create_indexes(target_relation) %} 116 | 117 | {{ run_hooks(post_hooks, inside_transaction=True) }} 118 | 119 | {% set should_revoke = 
should_revoke(existing_relation, full_refresh_mode=True) %} 120 | {% do apply_grants(target_relation, grant_config, should_revoke=should_revoke) %} 121 | 122 | {% do persist_docs(target_relation, model) %} 123 | 124 | -- `COMMIT` happens here 125 | {{ adapter.commit() }} 126 | 127 | {{ run_hooks(post_hooks, inside_transaction=False) }} 128 | -- end from the table mat 129 | 130 | {%- set status_string = "INSERT " ~ loop_vars['sum_rows_inserted'] -%} 131 | 132 | {% call noop_statement('main', status_string) -%} 133 | -- no-op 134 | {%- endcall %} 135 | 136 | -- Return the relations created in this materialization 137 | {{ return({'relations': [target_relation]}) }} 138 | 139 | {%- endmaterialization %} -------------------------------------------------------------------------------- /lambda-views/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_modules/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /lambda-views/README.md: -------------------------------------------------------------------------------- 1 | # Lambda views 2 | 3 | ## Option 1: 4 | Implement this without any macros. 5 | 6 | ![Option 1 DAG](etc/option-1-dag.png) 7 | 8 | 9 | Things to note: 10 | - Use of the `run_started_at` [variable](https://docs.getdbt.com/reference/dbt-jinja-functions/run_started_at/) 11 | - We've added some meta fields to make debugging easier 12 | 13 | Pros: 14 | - Relatively easy to intuit what's going on 15 | 16 | Cons: 17 | - SQL is re-used — two models have the transformation SQL (e.g. `page_views__lambda_current` and `page_views__lambda_historical`), and the SQL in the models that union together the two relations is very similar 18 | - Very brittle — have to remember to materialize each model appropriately 19 | 20 | ## Option 2 21 | Use macros to reduce duplicated code: 22 | - Use a macro (e.g. `page_views_model_sql`) for the transformation SQL 23 | - Use macros, `lambda_filter` and `lambda_union`, to template the `where` clauses and the `union` model 24 | 25 | ![Option 2 DAG](etc/option-2-dag.png) 26 | 27 | Things to note: 28 | - Removed the `__lambda_current` views, since you don't strictly need to materialize those in your warehouse 29 | - Optional var, `lambda_split`, that can be overridden for the cutoff time 30 | - Added logic for a unique key (though that may have performance impacts) 31 | - The `lambda_filter` macro relies on the model having a matching column in both the source and target table: 32 | ```sql 33 | where {{ column_name }} >= (select max({{ column_name }}) from {{ this }}) 34 | and {{ column_name }} < '{{ filter_time }}' 35 | ``` 36 | 37 | Pros: 38 | - Less duplicated code 39 | - Less chance of silly mistakes 40 | - Fewer objects materialized in the warehouse 41 | 42 | Cons: 43 | - Harder to reason about — the model code lives separately from the models 44 | 45 | 46 | ## Thought experiment 47 | Use a custom materialization 48 | 49 | **Note: This doesn't actually work** 50 | ## Alt-alt approach: custom materialization?? 51 | 52 | I've included a mockup in `models/thought_experiment`. As the name suggests, this is only a thought experiment. 53 | 54 | **Pros:** 55 | * We can _both_ keep model SQL within the model file _and_ write that SQL only once 56 | 57 | **Cons:** 58 | * It obfuscates a _lot_ of logic into the materialization layer 59 | 60 | **Challenges:** 61 | * How to pass `config` values down to the historical model?
Namely `materialization` (table or incremental), `schema`, `alias` 62 | * How to "call" one materialization from another? We don't want to copy+paste all the logic from every adapter's `incremental` materialization into a new `lambda_view` materialization 63 | * Will dbt break in new and interesting ways if it creates multiple objects in a database for one model? 64 | -------------------------------------------------------------------------------- /lambda-views/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/lambda-views/data/.gitkeep -------------------------------------------------------------------------------- /lambda-views/dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: 'lambda_views' 2 | version: '0.1.0' 3 | config-version: 2 4 | 5 | source-paths: ["models"] 6 | analysis-paths: ["analysis"] 7 | test-paths: ["tests"] 8 | data-paths: ["data"] 9 | macro-paths: ["macros"] 10 | snapshot-paths: ["snapshots"] 11 | 12 | target-path: "target" 13 | clean-targets: 14 | - "target" 15 | - "dbt_modules" 16 | 17 | models: 18 | lambda_views: 19 | option_1: 20 | enabled: false 21 | option_2: 22 | enabled: true 23 | thought_experiment: 24 | enabled: false 25 | -------------------------------------------------------------------------------- /lambda-views/etc/option-1-dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/lambda-views/etc/option-1-dag.png -------------------------------------------------------------------------------- /lambda-views/etc/option-2-dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/lambda-views/etc/option-2-dag.png -------------------------------------------------------------------------------- /lambda-views/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/lambda-views/macros/.gitkeep -------------------------------------------------------------------------------- /lambda-views/macros/lambda/lambda_filter.sql: -------------------------------------------------------------------------------- 1 | {% macro lambda_filter(column_name) %} 2 | 3 | {% set materialized = config.require('materialized') %} 4 | {% set filter_time = var('lambda_split', run_started_at) %} 5 | 6 | {% if materialized == 'view' %} 7 | 8 | where {{ column_name }} >= '{{ filter_time }}' 9 | 10 | {% elif is_incremental() %} 11 | 12 | where {{ column_name }} >= (select max({{ column_name }}) from {{ this }}) 13 | and {{ column_name }} < '{{ filter_time }}' 14 | 15 | {% else %} 16 | 17 | where {{ column_name }} < '{{ filter_time }}' 18 | 19 | {% endif %} 20 | 21 | {% endmacro %} 22 | -------------------------------------------------------------------------------- /lambda-views/macros/lambda/lambda_union.sql: -------------------------------------------------------------------------------- 1 | {% macro lambda_union(historical_relation, model_sql) %} 2 | 3 | {% set unique_key = config.get('unique_key', none) %} 4 | 5 | with historical as 
( 6 | 7 | select *, 8 | 'historical' as _dbt_lambda_view_source, 9 | '{{ run_started_at }}' as _dbt_last_run_at 10 | 11 | from {{ historical_relation }} 12 | 13 | ), 14 | 15 | new_raw as ( 16 | 17 | {{ model_sql }} 18 | 19 | ), 20 | 21 | new as ( 22 | 23 | select *, 24 | 'new' as _dbt_lambda_view_source, 25 | '{{ run_started_at }}' as _dbt_last_run_at 26 | 27 | from new_raw 28 | 29 | ), 30 | 31 | unioned as ( 32 | 33 | select * from historical 34 | 35 | {% if unique_key %} 36 | 37 | where {{ unique_key }} not in ( 38 | select {{ unique_key }} from new 39 | ) 40 | 41 | {% endif %} 42 | 43 | union all 44 | 45 | select * from new 46 | 47 | ) 48 | 49 | select * from unioned 50 | 51 | {% endmacro %} 52 | -------------------------------------------------------------------------------- /lambda-views/macros/models/page_views_model_sql.sql: -------------------------------------------------------------------------------- 1 | {% macro page_views_model_sql() %} 2 | 3 | with events as ( 4 | 5 | select * from {{ source('snowplow','event') }} 6 | 7 | {{ lambda_filter(column_name = 'collector_tstamp') }} 8 | 9 | ), 10 | 11 | page_views as ( 12 | 13 | select 14 | domain_sessionid as session_id, 15 | domain_userid as anonymous_user_id, 16 | web_page_context.value:data.id::varchar as page_view_id, 17 | page_url, 18 | count(*) * 10 as approx_time_on_page, 19 | min(derived_tstamp) as page_view_start, 20 | max(collector_tstamp) as collector_tstamp 21 | 22 | from events, 23 | lateral flatten (input => parse_json(contexts):data) web_page_context 24 | 25 | group by 1,2,3,4 26 | 27 | ) 28 | 29 | select * from page_views 30 | 31 | {% endmacro %} 32 | -------------------------------------------------------------------------------- /lambda-views/macros/models/sessions_model_sql.sql: -------------------------------------------------------------------------------- 1 | {% macro sessions_model_sql() %} 2 | 3 | with page_views as ( 4 | 5 | select * from {{ ref('page_views') }} 6 | 7 | {{ lambda_filter(column_name = 'collector_tstamp') }} 8 | 9 | ), 10 | 11 | sessions as ( 12 | 13 | select 14 | session_id, 15 | anonymous_user_id, 16 | 17 | count(*) as page_views, 18 | sum(approx_time_on_page) as total_time, 19 | min(page_view_start) as session_start, 20 | max(collector_tstamp) as collector_tstamp 21 | 22 | from page_views 23 | 24 | group by 1,2 25 | 26 | ) 27 | 28 | select * from sessions 29 | 30 | {% endmacro %} 31 | -------------------------------------------------------------------------------- /lambda-views/models/option_1/page_views.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='view' 4 | ) 5 | }} 6 | 7 | with historical as ( 8 | 9 | select 10 | *, 11 | 'historical' as _dbt_lambda_view_source, 12 | '{{ run_started_at }}' as _dbt_last_run_at 13 | 14 | from {{ ref('page_views__lambda_historical') }} 15 | 16 | where collector_tstamp < '{{ run_started_at }}' 17 | 18 | ), 19 | 20 | new as ( 21 | 22 | select 23 | *, 24 | 'new' as _dbt_lambda_view_source, 25 | '{{ run_started_at }}' as _dbt_last_run_at 26 | 27 | from {{ ref('page_views__lambda_current') }} 28 | 29 | where collector_tstamp >= '{{ run_started_at }}' 30 | 31 | ), 32 | 33 | 34 | unioned as ( 35 | 36 | select * from current_view 37 | 38 | union all 39 | 40 | select * from historical_table 41 | 42 | ) 43 | 44 | select * from unioned 45 | -------------------------------------------------------------------------------- /lambda-views/models/option_1/page_views__lambda_current.sql: 
-------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='view' 4 | ) 5 | }} 6 | 7 | with events as ( 8 | 9 | select * from {{ source('snowplow','event') }} 10 | where collector_tstamp >= '{{ run_started_at }}' 11 | 12 | ), 13 | 14 | page_views as ( 15 | 16 | select 17 | domain_sessionid as session_id, 18 | domain_userid as anonymous_user_id, 19 | web_page_context.value:data.id::varchar as page_view_id, 20 | page_url, 21 | count(*) * 10 as approx_time_on_page, 22 | min(derived_tstamp) as page_view_start, 23 | max(collector_tstamp) as collector_tstamp 24 | 25 | from events, 26 | lateral flatten (input => parse_json(contexts):data) web_page_context 27 | 28 | group by 1,2,3,4 29 | 30 | ) 31 | 32 | select * from page_views 33 | -------------------------------------------------------------------------------- /lambda-views/models/option_1/page_views__lambda_historical.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key = 'page_view_id' 5 | ) 6 | }} 7 | 8 | with events as ( 9 | 10 | select * from {{ source('snowplow','event') }} 11 | {% if is_incremental() %} 12 | where collector_tstamp >= (select max(collector_tstamp) from {{ this }}) 13 | {% endif %} 14 | 15 | ), 16 | 17 | page_views as ( 18 | 19 | select 20 | domain_sessionid as session_id, 21 | domain_userid as anonymous_user_id, 22 | web_page_context.value:data.id::varchar as page_view_id, 23 | page_url, 24 | count(*) * 10 as approx_time_on_page, 25 | min(derived_tstamp) as page_view_start, 26 | max(collector_tstamp) as collector_tstamp 27 | 28 | from events, 29 | lateral flatten (input => parse_json(contexts):data) web_page_context 30 | 31 | group by 1,2,3,4 32 | 33 | ) 34 | 35 | select * from page_views 36 | -------------------------------------------------------------------------------- /lambda-views/models/option_1/sessions.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='view' 4 | ) 5 | }} 6 | 7 | with historical as ( 8 | 9 | select 10 | *, 11 | 'historical' as _dbt_lambda_view_source, 12 | '{{ run_started_at }}' as _dbt_last_run_at 13 | 14 | from {{ ref('sessions__lambda_historical') }} 15 | 16 | where collector_tstamp < '{{ run_started_at }}' 17 | 18 | ), 19 | 20 | new as ( 21 | 22 | select 23 | *, 24 | 'new' as _dbt_lambda_view_source, 25 | '{{ run_started_at }}' as _dbt_last_run_at 26 | 27 | from {{ ref('sessions__lambda_current') }} 28 | 29 | where collector_tstamp >= '{{ run_started_at }}' 30 | 31 | ), 32 | 33 | unioned as ( 34 | 35 | select * from current_view 36 | 37 | union all 38 | 39 | select * from historical_table 40 | 41 | ) 42 | 43 | select * from unioned 44 | -------------------------------------------------------------------------------- /lambda-views/models/option_1/sessions__lambda_current.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='view' 4 | ) 5 | }} 6 | 7 | with page_views as ( 8 | 9 | select * from {{ ref('page_views') }} 10 | 11 | where collector_tstamp >= '{{ run_started_at }}' 12 | 13 | ), 14 | 15 | sessions as ( 16 | 17 | select 18 | session_id, 19 | anonymous_user_id, 20 | 21 | count(*) as page_views, 22 | sum(approx_time_on_page) as total_time, 23 | min(page_view_start) as session_start, 24 | max(collector_tstamp) as collector_tstamp 25 | 26 | from page_views 27 | 28 | group by 1,2 29 | 30 | ) 31 | 32 
| select * from sessions 33 | -------------------------------------------------------------------------------- /lambda-views/models/option_1/sessions__lambda_historical.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized='incremental', 4 | unique_key = 'session_id' 5 | ) 6 | }} 7 | 8 | with page_views as ( 9 | 10 | select * from {{ ref('page_views') }} 11 | 12 | {% if is_incremental() %} 13 | where collector_tstamp >= (select max(collector_tstamp) from {{ this }}) 14 | {% endif %} 15 | 16 | ), 17 | 18 | sessions as ( 19 | 20 | select 21 | session_id, 22 | anonymous_user_id, 23 | 24 | count(*) as page_views, 25 | sum(approx_time_on_page) as total_time, 26 | min(page_view_start) as session_start, 27 | max(collector_tstamp) as collector_tstamp 28 | 29 | from page_views 30 | 31 | group by 1,2 32 | 33 | ) 34 | 35 | select * from sessions 36 | -------------------------------------------------------------------------------- /lambda-views/models/option_2/page_views.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'view', 4 | unique_key = 'page_view_id' 5 | ) 6 | }} 7 | 8 | {{ lambda_union( 9 | historical_relation = ref(this.name ~ '__lambda_historical'), 10 | model_sql = page_views_model_sql() 11 | ) }} 12 | -------------------------------------------------------------------------------- /lambda-views/models/option_2/page_views__lambda_historical.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'delete+insert', 5 | unique_key = 'page_view_id', 6 | schema = 'lambda_historical', 7 | alias = 'page_views' 8 | ) 9 | }} 10 | 11 | {{ page_views_model_sql() }} 12 | -------------------------------------------------------------------------------- /lambda-views/models/option_2/sessions.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'view', 4 | unique_key = 'session_id' 5 | ) 6 | }} 7 | 8 | {{ lambda_union( 9 | historical_relation = ref(this.name ~ '__lambda_historical'), 10 | model_sql = sessions_model_sql() 11 | ) }} 12 | -------------------------------------------------------------------------------- /lambda-views/models/option_2/sessions__lambda_historical.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'incremental', 4 | incremental_strategy = 'delete+insert', 5 | unique_key = 'session_id', 6 | schema = 'lambda_historical', 7 | alias = 'sessions' 8 | ) 9 | }} 10 | 11 | {{ sessions_model_sql() }} 12 | -------------------------------------------------------------------------------- /lambda-views/models/sources.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | sources: 3 | - name: snowplow 4 | database: raw 5 | loaded_at_field: collector_tstamp 6 | freshness: 7 | error_after: {count: 1, period: hour} 8 | tables: 9 | - name: event 10 | -------------------------------------------------------------------------------- /lambda-views/models/thought_experiment/page_views.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'lambda_view', 4 | unique_key = 'page_view_id', 5 | historical_config = { 6 | 'materialized': 'incremental', 7 | 'schema': 'lambda_historical', 8 | 'alias': 
'page_views' 9 | } 10 | ) 11 | }} 12 | 13 | with events as ( 14 | 15 | select * from {{ source('snowplow','event') }} 16 | 17 | {{ lambda_filter('collector_tstamp') }} 18 | 19 | ), 20 | 21 | page_views as ( 22 | 23 | select 24 | domain_sessionid as session_id, 25 | domain_userid as anonymous_user_id, 26 | web_page_context.value:data.id::varchar as page_view_id, 27 | page_url, 28 | count(*) * 10 as approx_time_on_page, 29 | min(derived_tstamp) as page_view_start, 30 | max(collector_tstamp) as collector_tstamp 31 | 32 | from events, 33 | lateral flatten (input => parse_json(contexts):data) web_page_context 34 | 35 | group by 1,2,3,4 36 | 37 | ) 38 | 39 | select * from page_views 40 | -------------------------------------------------------------------------------- /lambda-views/models/thought_experiment/sessions.sql: -------------------------------------------------------------------------------- 1 | {{ 2 | config( 3 | materialized = 'lambda_view', 4 | unique_key = 'session_id', 5 | historical_config = { 6 | 'materialized': 'incremental', 7 | 'schema': 'lambda_historical', 8 | 'alias': 'sessions' 9 | } 10 | ) 11 | }} 12 | 13 | with page_views as ( 14 | 15 | select * from {{ ref('page_views') }} 16 | 17 | {{ lambda_filter(column_name = 'collector_tstamp') }} 18 | 19 | ), 20 | 21 | sessions as ( 22 | 23 | select 24 | session_id, 25 | anonymous_user_id, 26 | 27 | count(*) as page_views, 28 | sum(approx_time_on_page) as total_time, 29 | min(page_view_start) as session_start, 30 | max(collector_tstamp) as collector_tstamp 31 | 32 | from page_views 33 | 34 | group by 1,2 35 | 36 | ) 37 | 38 | select * from sessions 39 | -------------------------------------------------------------------------------- /materialized-views/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_modules/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /materialized-views/README.md: -------------------------------------------------------------------------------- 1 | ## dbt_labs_materialized_views 2 | 3 | `dbt_labs_materialized_views` is a dbt project containing materializations, helper macros, and some builtin macro overrides that enable use of materialized views in your dbt project. It takes a conceptual approach similar to that of the existing `incremental` materialization: 4 | - In a "full refresh" run, drop and recreate the MV from scratch. 5 | - Otherwise, "refresh" the MV as appropriate. Depending on the database, that could require DML (`refresh`) or no action. 6 | 7 | At any point, if the database object corresponding to a MV model exists instead as a table or standard view, dbt will attempt to drop it and recreate the model from scratch as a materialized view. 8 | 9 | Materialized views vary significantly across databases, as do their current limitations. Be sure to read the documentation for your adapter. 10 | 11 | If you're here, you may also like the [dbt-materialize](https://github.com/MaterializeInc/materialize/tree/main/misc/dbt-materialize) plugin, which enables dbt to materialize models as materialized views in [Materialize](https://materialize.io/). 12 | 13 | ## Setup 14 | 15 | ### General installation: 16 | 17 | You can install the materialized-view functionality using one of the following methods.
18 | 19 | - Install this project as a package ([package-management docs](https://docs.getdbt.com/docs/building-a-dbt-project/package-management)) 20 | - [Local package](https://docs.getdbt.com/docs/building-a-dbt-project/package-management#local-packages): by referencing the [`materialized-views`](https://github.com/dbt-labs/dbt-labs-experimental-features/tree/master/materialized-views) folder. 21 | - [Git package](https://docs.getdbt.com/docs/building-a-dbt-project/package-management#git-packages) using [project subdirectories](https://docs.getdbt.com/docs/building-a-dbt-project/package-management#git-packages): again by referencing the [`materialized-views`](https://github.com/dbt-labs/dbt-labs-experimental-features/tree/master/materialized-views) folder. 22 | - Copy-paste the files from `macros/` (specifically `default` and your adapter) into your own project. 23 | 24 | ### Extra installation steps for Postgres and Redshift 25 | 26 | The Postgres and Redshift implementations both require overriding the builtin versions of some adapter macros. If you've installed `dbt_labs_materialized_views` as a local package, you can achieve this override by creating a file `macros/*.sql` in your project with the following contents: 27 | 28 | ```sql 29 | {# postgres and redshift #} 30 | 31 | {% macro drop_relation(relation) -%} 32 | {{ return(dbt_labs_materialized_views.drop_relation(relation)) }} 33 | {% endmacro %} 34 | 35 | {% macro postgres__list_relations_without_caching(schema_relation) %} 36 | {{ return(dbt_labs_materialized_views.postgres__list_relations_without_caching(schema_relation)) }} 37 | {% endmacro %} 38 | 39 | {% macro postgres_get_relations() %} 40 | {{ return(dbt_labs_materialized_views.postgres_get_relations()) }} 41 | {% endmacro %} 42 | 43 | {# redshift only #} 44 | 45 | {% macro redshift__list_relations_without_caching(schema_relation) %} 46 | {{ return(dbt_labs_materialized_views.redshift__list_relations_without_caching(schema_relation)) }} 47 | {% endmacro %} 48 | 49 | {% macro load_relation(relation) %} 50 | {{ return(dbt_labs_materialized_views.redshift_load_relation_or_mv(relation)) }} 51 | {% endmacro %} 52 | ``` 53 | 54 | ## Postgres 55 | 56 | - Supported model configs: none 57 | - [docs](https://www.postgresql.org/docs/9.3/rules-materializedviews.html) 58 | 59 | ## Redshift 60 | 61 | - Supported model configs: `sort`, `dist`, `auto_refresh` 62 | - [docs](https://docs.aws.amazon.com/redshift/latest/dg/materialized-view-overview.html) 63 | - Anecdotally, `refresh materialized view ...` is very slow to run. By contrast, `auto_refresh` runs in the background, with minimal disruption to other workloads, at the risk of some small potential latency. 64 | - ❗ MVs do not support late binding, so if an underlying table is cascade-dropped, the MV will be dropped as well. This would be fine, except that MVs don't include their "true" dependencies in `pg_depend`. Instead, a materialized view claims to depend on a table relation called `mv_tbl__[MV_name]__0`, in place of the name of the true underlying table (https://github.com/awslabs/amazon-redshift-utils/issues/499). As such, dbt's runtime cache is unable to reliably know if a MV has been dropped when it cascade-drops the underlying table. This package requires an override of `load_relation()` to perform a "hard" check (database query of `stv_mv_info`) every time dbt's cache thinks a `materializedview` relation may already exist. 
65 | - ❗ MVs do appear in `pg_views`, but the only way we can know that they're materialized views is that the `create materialized view` DDL appears in their `definition`, instead of just the SQL without DDL (standard views). There's another Redshift system table, `stv_mv_info`, but it can't effectively be joined with `pg_views` because they're different types of system tables. 66 | - ❗ If a column in the underlying table is renamed, or removed and re-added (e.g. varchar widening), the materialized view cannot be refreshed: 67 | ``` 68 | Database Error in model test_mv (models/test_mv.sql) 69 | Materialized view test_mv is unrefreshable as a column was renamed for a base table. 70 | compiled SQL at target/run/dbt_labs_experimental_features_integration_tests/test_mv.sql 71 | ``` 72 | 73 | ## BigQuery 74 | 75 | - Supported model configs: `auto_refresh`, `refresh_interval_minutes` 76 | - [docs](https://cloud.google.com/bigquery/docs/materialized-views-intro) 77 | - ❗ Although BQ does not have `drop ... cascade`, if the base table of a MV is dropped and recreated, the MV also needs to be dropped and recreated: 78 | ``` 79 | Materialized view dbt-dev-168022:dbt_jcohen.test_mv references table dbt-dev-168022:dbt_jcohen.base_tbl which was deleted and recreated. The view must be deleted and recreated as well. 80 | ``` 81 | 82 | ## Snowflake 83 | 84 | - Supported model configs: `secure`, `cluster_by`, `automatic_clustering`, `persist_docs` (relation only) 85 | - [docs](https://docs.snowflake.com/en/user-guide/views-materialized.html) 86 | - ❗ Note: Snowflake MVs are only enabled on enterprise accounts 87 | - ❗ Although Snowflake does not have `drop ... cascade`, if the base table of a MV is dropped and recreated, the MV also needs to be dropped and recreated, otherwise the following error will appear: 88 | ``` 89 | Failure during expansion of view 'TEST_MV': SQL compilation error: Materialized View TEST_MV is invalid.
90 | ``` 91 | -------------------------------------------------------------------------------- /materialized-views/dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: 'dbt_labs_materialized_views' 2 | version: '0.2.0' 3 | config-version: 2 4 | require-dbt-version: ">=1.0.0" 5 | 6 | model-paths: ["models"] 7 | analysis-paths: ["analysis"] 8 | test-paths: ["tests"] 9 | seed-paths: ["seed"] 10 | macro-paths: ["macros"] 11 | snapshot-paths: ["snapshots"] 12 | 13 | target-path: "target" 14 | clean-targets: 15 | - "target" 16 | - "dbt_modules" 17 | -------------------------------------------------------------------------------- /materialized-views/integration_tests/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_modules/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /materialized-views/integration_tests/Makefile: -------------------------------------------------------------------------------- 1 | test-postgres: 2 | dbt deps 3 | dbt seed --target postgres --full-refresh 4 | dbt run --target postgres --full-refresh --vars 'update: false' 5 | dbt run --target postgres --vars 'update: true' 6 | dbt test --target postgres 7 | 8 | test-redshift: 9 | dbt deps 10 | dbt seed --target redshift --full-refresh 11 | dbt run --target redshift --full-refresh --vars 'update: false' 12 | dbt run --target redshift --vars 'update: true' 13 | sleep 10 # wait for auto refresh 14 | dbt test --target redshift 15 | 16 | test-snowflake: 17 | dbt deps 18 | dbt seed --profile garage-snowflake --full-refresh 19 | dbt run --profile garage-snowflake --full-refresh --vars 'update: false' 20 | dbt run --profile garage-snowflake --vars 'update: true' 21 | dbt test --profile garage-snowflake 22 | 23 | test-bigquery: 24 | dbt deps 25 | dbt seed --target bigquery --full-refresh 26 | dbt run --target bigquery --full-refresh --vars 'update: false' 27 | dbt run --target bigquery --vars 'update: true' 28 | dbt test --target bigquery 29 | 30 | test-all: test-postgres test-redshift test-snowflake test-bigquery 31 | echo "Completed successfully" 32 | -------------------------------------------------------------------------------- /materialized-views/integration_tests/dbt_project.yml: -------------------------------------------------------------------------------- 1 | 2 | name: 'dbt_labs_materialized_views_integration_tests' 3 | version: '0.2.0' 4 | config-version: 2 5 | 6 | profile: 'integration_tests' 7 | 8 | model-paths: ["models"] 9 | analysis-paths: ["analysis"] 10 | test-paths: ["tests"] 11 | seed-paths: ["seed"] 12 | macro-paths: ["macros"] 13 | 14 | target-path: "target" 15 | clean-targets: 16 | - "target" 17 | - "dbt_modules" 18 | 19 | quoting: 20 | identifier: false 21 | schema: false 22 | 23 | seeds: 24 | quote_columns: false 25 | -------------------------------------------------------------------------------- /materialized-views/integration_tests/macros/overrides.sql: -------------------------------------------------------------------------------- 1 | {# postgres + redshift #} 2 | 3 | {% macro drop_relation(relation) -%} 4 | {{ return(dbt_labs_materialized_views.drop_relation(relation)) }} 5 | {% endmacro %} 6 | 7 | {% macro postgres__list_relations_without_caching(schema_relation) %} 8 | {{ return(dbt_labs_materialized_views.postgres__list_relations_without_caching(schema_relation)) }} 9 | {% endmacro %} 10 | 11 | {% macro postgres_get_relations() %} 12 | 
{{ return(dbt_labs_materialized_views.postgres_get_relations()) }} 13 | {% endmacro %} 14 | 15 | {# redshift only #} 16 | 17 | {% macro redshift__list_relations_without_caching(schema_relation) %} 18 | {{ return(dbt_labs_materialized_views.redshift__list_relations_without_caching(schema_relation)) }} 19 | {% endmacro %} 20 | 21 | {% macro load_relation(relation) %} 22 | {% if adapter.type() == 'redshift' %} 23 | {{ return(dbt_labs_materialized_views.redshift_load_relation_or_mv(relation)) }} 24 | {% else %} 25 | {{ return(dbt.load_relation(relation)) }} 26 | {% endif %} 27 | {% endmacro %} 28 | -------------------------------------------------------------------------------- /materialized-views/integration_tests/models/base_tbl.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'incremental', 3 | unique_key = 'id' 4 | )}} 5 | 6 | -- depends on: {{ref('seed_update')}} 7 | -- depends on: {{ref('seed')}} 8 | 9 | {% if is_incremental() %} 10 | 11 | select * from {{ref('seed_update')}} 12 | 13 | {% else %} 14 | 15 | select * from {{ref('seed')}} 16 | 17 | {% endif %} 18 | -------------------------------------------------------------------------------- /materialized-views/integration_tests/models/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | models: 4 | - name: test_mv_manual 5 | tests: 6 | - dbt_utils.equality: 7 | compare_model: ref('expected') 8 | - name: test_mv_auto 9 | tests: 10 | - dbt_utils.equality: 11 | compare_model: ref('expected') 12 | -------------------------------------------------------------------------------- /materialized-views/integration_tests/models/test_mv_auto.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'materialized_view', 3 | auto_refresh = true 4 | )}} 5 | 6 | select 7 | 8 | gender, 9 | count(*) as num 10 | 11 | from {{ref('base_tbl')}} 12 | group by 1 13 | -------------------------------------------------------------------------------- /materialized-views/integration_tests/models/test_mv_manual.sql: -------------------------------------------------------------------------------- 1 | {{config( 2 | materialized = 'materialized_view', 3 | auto_refresh = false 4 | )}} 5 | 6 | select 7 | 8 | gender, 9 | count(*) as num 10 | 11 | from {{ref('base_tbl')}} 12 | group by 1 13 | -------------------------------------------------------------------------------- /materialized-views/integration_tests/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - local: ../ 3 | - package: fishtown-analytics/dbt_utils 4 | version: 0.6.4 5 | -------------------------------------------------------------------------------- /materialized-views/integration_tests/seed/expected.csv: -------------------------------------------------------------------------------- 1 | gender,num 2 | Female,6 3 | Male,4 -------------------------------------------------------------------------------- /materialized-views/integration_tests/seed/seed.csv: -------------------------------------------------------------------------------- 1 | id,first_name,last_name,email,gender,ip_address 2 | 1,Jacqueline,Hunter,jhunter0@pbs.org,Male,59.80.20.168 3 | 2,Kathryn,Walker,kwalker1@ezinearticles.com,Female,194.121.179.35 4 | 3,Gerald,Ryan,gryan2@com.com,Male,11.3.212.243 5 | 4,Bonnie,Spencer,bspencer3@ameblo.jp,Female,216.32.196.175 6 | 
5,Harold,Taylor,htaylor4@people.com.cn,Male,253.10.246.136 -------------------------------------------------------------------------------- /materialized-views/integration_tests/seed/seed_update.csv: -------------------------------------------------------------------------------- 1 | id,first_name,last_name,email,gender,ip_address 2 | 1,Jacqueline,Hunter,jhunter0@pbs.org,Male,59.80.20.168 3 | 2,Kathryn,Walker,kwalker1@ezinearticles.com,Female,194.121.179.35 4 | 3,Gerald,Ryan,gryan2@com.com,Female,11.3.212.243 5 | 4,Bonnie,Spencer,bspencer3@ameblo.jp,Female,216.32.196.175 6 | 5,Harold,Taylor,htaylor4@people.com.cn,Male,253.10.246.136 7 | 6,Jack,Griffin,jgriffin5@t.co,Female,16.13.192.220 8 | 7,Wanda,Arnold,warnold6@google.nl,Female,232.116.150.64 9 | 8,Craig,Ortiz,cortiz7@sciencedaily.com,Male,199.126.106.13 10 | 9,Gary,Day,gday8@nih.gov,Male,35.81.68.186 11 | 10,Rose,Wright,rwright9@yahoo.co.jp,Female,236.82.178.100 -------------------------------------------------------------------------------- /materialized-views/macros/bigquery/adapters.sql: -------------------------------------------------------------------------------- 1 | {% macro bigquery_options() %} 2 | {%- set opts = kwargs -%} 3 | {%- set options -%} 4 | OPTIONS({% for opt_key, opt_val in kwargs.items() if opt_val is not none %} 5 | {{ opt_key }}={{ opt_val }}{{ "," if not loop.last }} 6 | {%- endfor -%}) 7 | {%- endset %} 8 | {%- do return(options) -%} 9 | {%- endmacro -%} 10 | 11 | {% macro bigquery__create_materialized_view_as(relation, sql, config) -%} 12 | 13 | {%- set enable_refresh = config.get('auto_refresh', none) -%} 14 | {%- set refresh_interval_minutes = config.get('refresh_interval_minutes', none) -%} 15 | {%- set sql_header = config.get('sql_header', none) -%} 16 | 17 | {{ sql_header if sql_header is not none }} 18 | 19 | create materialized view {{relation}} 20 | {{ dbt_labs_materialized_views.bigquery_options( 21 | enable_refresh=enable_refresh, 22 | refresh_interval_minutes=refresh_interval_minutes 23 | ) }} 24 | as ( 25 | {{sql}} 26 | ) 27 | 28 | {% endmacro %} 29 | 30 | 31 | {% macro bigquery__refresh_materialized_view(relation, config) -%} 32 | 33 | {%- set is_auto_refresh = config.get('auto_refresh', true) %} 34 | 35 | {%- if is_auto_refresh == false -%} {# manual refresh #} 36 | 37 | {% set refresh_command %} 38 | call bq.refresh_materialized_view('{{relation|replace("`","")}}') 39 | {% endset %} 40 | 41 | {%- do return(refresh_command) -%} 42 | 43 | {%- else -%} {# automatic refresh #} 44 | 45 | {%- do log("Skipping materialized view " ~ relation ~ " because it is set 46 | to refresh automatically") -%} 47 | 48 | {%- do return(none) -%} 49 | 50 | {%- endif -%} 51 | 52 | {% endmacro %} 53 | -------------------------------------------------------------------------------- /materialized-views/macros/bigquery/materialized_view.sql: -------------------------------------------------------------------------------- 1 | {% materialization materialized_view, adapter='bigquery' -%} 2 | 3 | {% set full_refresh_mode = (should_full_refresh()) %} 4 | 5 | {% set target_relation = this %} 6 | {% set existing_relation = load_relation(this) %} 7 | {% set tmp_relation = make_temp_relation(this) %} 8 | 9 | {{ run_hooks(pre_hooks) }} 10 | 11 | {% if existing_relation is none %} 12 | {% set build_sql = dbt_labs_materialized_views.create_materialized_view_as(target_relation, sql, config) %} 13 | {% elif existing_relation.is_view or existing_relation.is_table %} 14 | {#-- Can't overwrite a view with a table - we must drop --#} 15 | 
{{ log("Dropping relation " ~ target_relation ~ " because it is a " ~ existing_relation.type ~ " and this model is a materialized view.") }} 16 | {% do adapter.drop_relation(existing_relation) %} 17 | {% set build_sql = dbt_labs_materialized_views.create_materialized_view_as(target_relation, sql, config) %} 18 | {% elif full_refresh_mode %} 19 | {#-- create or replace not yet supported for materialized views --#} 20 | {{ log("Dropping relation " ~ target_relation ~ " because replacing an existing materialized view is not supported.") }} 21 | {% do adapter.drop_relation(existing_relation) %} 22 | {% set build_sql = dbt_labs_materialized_views.create_materialized_view_as(target_relation, sql, config) %} 23 | {% else %} 24 | {% set build_sql = dbt_labs_materialized_views.refresh_materialized_view(target_relation, config) %} 25 | {% endif %} 26 | 27 | {% if build_sql %} 28 | {% call statement("main") %} 29 | {{ build_sql }} 30 | {% endcall %} 31 | {% else %} 32 | {{ store_result('main', 'SKIP') }} 33 | {% endif %} 34 | 35 | {{ run_hooks(post_hooks) }} 36 | 37 | {% do persist_docs(target_relation, model) %} 38 | 39 | {{ return({'relations': [target_relation]}) }} 40 | 41 | {%- endmaterialization %} 42 | -------------------------------------------------------------------------------- /materialized-views/macros/default/adapters.sql: -------------------------------------------------------------------------------- 1 | {% macro create_materialized_view_as(relation, sql, config) %} 2 | {{ return(adapter.dispatch('create_materialized_view_as', macro_namespace = 'dbt_labs_materialized_views')(relation, sql, config)) }} 3 | {% endmacro %} 4 | 5 | {% macro default__create_materialized_view_as(relation, sql, config) -%} 6 | 7 | create materialized view {{relation}} as ( 8 | {{sql}} 9 | ) 10 | 11 | {% endmacro %} 12 | 13 | {% macro refresh_materialized_view(relation, config) %} 14 | {{ return(adapter.dispatch('refresh_materialized_view', macro_namespace = 'dbt_labs_materialized_views')(relation, config)) }} 15 | {% endmacro %} 16 | 17 | {% macro default__refresh_materialized_view(relation, config) -%} 18 | 19 | refresh materialized view {{relation}} 20 | 21 | {% endmacro %} 22 | 23 | {# override builtin behavior of adapter.drop_relation #} 24 | {% macro drop_relation(relation) -%} 25 | {% set relation_type = 'materialized view' if relation.type == 'materializedview' else relation.type %} 26 | {% call statement('drop_relation', auto_begin=False) -%} 27 | drop {{ relation_type }} if exists {{ relation }} cascade 28 | {%- endcall %} 29 | {% endmacro %} 30 | -------------------------------------------------------------------------------- /materialized-views/macros/default/materialized_view.sql: -------------------------------------------------------------------------------- 1 | {% materialization materialized_view, default -%} 2 | 3 | {% set full_refresh_mode = (should_full_refresh()) %} 4 | 5 | {% set target_relation = this %} 6 | {% set existing_relation = load_relation(this) %} 7 | {% set tmp_relation = make_temp_relation(this) %} 8 | 9 | {{ run_hooks(pre_hooks, inside_transaction=False) }} 10 | 11 | -- `BEGIN` happens here: 12 | {{ run_hooks(pre_hooks, inside_transaction=True) }} 13 | 14 | {% set to_drop = [] %} 15 | 16 | {% if existing_relation is none %} 17 | {% set build_sql = dbt_labs_materialized_views.create_materialized_view_as(target_relation, sql, config) %} 18 | 19 | {% elif full_refresh_mode or existing_relation.type != 'materializedview' %} 20 | {#-- Make sure the backup doesn't exist so we 
don't encounter issues with the rename below #} 21 | {% set backup_identifier = existing_relation.identifier ~ "__dbt_backup" %} 22 | {% set backup_relation = existing_relation.incorporate(path={"identifier": backup_identifier}) %} 23 | {% do adapter.drop_relation(backup_relation) %} 24 | 25 | {% do adapter.rename_relation(target_relation, backup_relation) %} 26 | {% set build_sql = dbt_labs_materialized_views.create_materialized_view_as(target_relation, sql, config) %} 27 | {% do to_drop.append(backup_relation) %} 28 | 29 | {% else %} 30 | {% set build_sql = dbt_labs_materialized_views.refresh_materialized_view(target_relation, config) %} 31 | {% endif %} 32 | 33 | {% if build_sql %} 34 | 35 | {% call statement("main") %} 36 | {{ build_sql }} 37 | {% endcall %} 38 | 39 | {{ run_hooks(post_hooks, inside_transaction=True) }} 40 | 41 | {% do persist_docs(target_relation, model) %} 42 | 43 | -- `COMMIT` happens here 44 | {% do adapter.commit() %} 45 | 46 | {% else %} 47 | 48 | {{ store_result('main', 'SKIP') }} 49 | 50 | {% endif %} 51 | 52 | {% for rel in to_drop %} 53 | {% do adapter.drop_relation(rel) %} 54 | {% endfor %} 55 | 56 | {{ run_hooks(post_hooks, inside_transaction=False) }} 57 | 58 | {{ return({'relations': [target_relation]}) }} 59 | 60 | {%- endmaterialization %} 61 | -------------------------------------------------------------------------------- /materialized-views/macros/postgres/adapters.sql: -------------------------------------------------------------------------------- 1 | {% macro postgres__list_relations_without_caching(schema_relation) %} 2 | {% call statement('list_relations_without_caching', fetch_result=True) -%} 3 | select 4 | '{{ schema_relation.database }}' as database, 5 | tablename as name, 6 | schemaname as schema, 7 | 'table' as type 8 | from pg_tables 9 | where schemaname ilike '{{ schema_relation.schema }}' 10 | union all 11 | select 12 | '{{ schema_relation.database }}' as database, 13 | viewname as name, 14 | schemaname as schema, 15 | 'view' as type 16 | from pg_views 17 | where schemaname ilike '{{ schema_relation.schema }}' 18 | union all 19 | select 20 | '{{ schema_relation.database }}' as database, 21 | matviewname as name, 22 | schemaname as schema, 23 | 'materializedview' as type 24 | from pg_matviews 25 | where schemaname ilike '{{ schema_relation.schema }}' 26 | {% endcall %} 27 | {{ return(load_result('list_relations_without_caching').table) }} 28 | {% endmacro %} 29 | 30 | 31 | {% macro postgres_get_relations () -%} 32 | 33 | {# 34 | -- in pg_depend, objid is the dependent, refobjid is the referenced object 35 | -- > a pg_depend entry indicates that the referenced object cannot be 36 | -- > dropped without also dropping the dependent object. 
37 | #} 38 | 39 | {%- call statement('relations', fetch_result=True) -%} 40 | with relation as ( 41 | select 42 | pg_rewrite.ev_class as class, 43 | pg_rewrite.oid as id 44 | from pg_rewrite 45 | ), 46 | class as ( 47 | select 48 | oid as id, 49 | relname as name, 50 | relnamespace as schema, 51 | relkind as kind 52 | from pg_class 53 | ), 54 | dependency as ( 55 | select 56 | pg_depend.objid as id, 57 | pg_depend.refobjid as ref 58 | from pg_depend 59 | ), 60 | schema as ( 61 | select 62 | pg_namespace.oid as id, 63 | pg_namespace.nspname as name 64 | from pg_namespace 65 | where nspname != 'information_schema' and nspname not like 'pg\_%' 66 | ), 67 | referenced as ( 68 | select 69 | relation.id AS id, 70 | referenced_class.name , 71 | referenced_class.schema , 72 | referenced_class.kind 73 | from relation 74 | join class as referenced_class on relation.class=referenced_class.id 75 | where referenced_class.kind in ('r', 'v', 'm') 76 | ), 77 | relationships as ( 78 | select 79 | referenced.name as referenced_name, 80 | referenced.schema as referenced_schema_id, 81 | dependent_class.name as dependent_name, 82 | dependent_class.schema as dependent_schema_id, 83 | referenced.kind as kind 84 | from referenced 85 | join dependency on referenced.id=dependency.id 86 | join class as dependent_class on dependency.ref=dependent_class.id 87 | where 88 | (referenced.name != dependent_class.name or 89 | referenced.schema != dependent_class.schema) 90 | ) 91 | 92 | select 93 | referenced_schema.name as referenced_schema, 94 | relationships.referenced_name as referenced_name, 95 | dependent_schema.name as dependent_schema, 96 | relationships.dependent_name as dependent_name 97 | from relationships 98 | join schema as dependent_schema on relationships.dependent_schema_id=dependent_schema.id 99 | join schema as referenced_schema on relationships.referenced_schema_id=referenced_schema.id 100 | group by referenced_schema, referenced_name, dependent_schema, dependent_name 101 | order by referenced_schema, referenced_name, dependent_schema, dependent_name; 102 | 103 | {%- endcall -%} 104 | 105 | {{ return(load_result('relations').table) }} 106 | {% endmacro %} 107 | -------------------------------------------------------------------------------- /materialized-views/macros/redshift/adapters.sql: -------------------------------------------------------------------------------- 1 | {% macro redshift__create_materialized_view_as(relation, sql, config) -%} 2 | 3 | {%- set _dist = config.get('dist') -%} 4 | {%- set _sort_type = config.get( 5 | 'sort_type', 6 | validator=validation.any['compound', 'interleaved']) -%} 7 | {%- set _sort = config.get( 8 | 'sort', 9 | validator=validation.any[list, basestring]) -%} 10 | {%- set sql_header = config.get('sql_header', none) -%} 11 | {%- set auto_refresh = 'yes' if config.get('auto_refresh', false) else 'no' %} 12 | 13 | {{ sql_header if sql_header is not none }} 14 | 15 | create materialized view {{ relation }} 16 | {{ dist(_dist) }} 17 | {{ sort(_sort_type, _sort) }} 18 | auto refresh {{ auto_refresh }} 19 | as ( 20 | {{ sql }} 21 | ); 22 | {%- endmacro %} 23 | 24 | 25 | {% macro redshift__refresh_materialized_view(relation, config) -%} 26 | 27 | {%- set is_auto_refresh = config.get('auto_refresh', true) %} 28 | 29 | {%- if is_auto_refresh == false -%} {# manual refresh #} 30 | 31 | refresh materialized view {{relation}} 32 | 33 | {%- else -%} {# automatic refresh #} 34 | 35 | {%- do log("Skipping materialized view " ~ relation ~ " because it is set 36 | to refresh 
automatically") -%} 37 | 38 | {%- do return(none) -%} 39 | 40 | {%- endif -%} 41 | 42 | {% endmacro %} 43 | 44 | 45 | {% macro redshift__list_relations_without_caching(schema_relation) %} 46 | {# 47 | pretty silly, but this is the best Redshift has given us. 48 | we effectively can't join stv_mv_info here, 49 | because they're different types of sytem tables (pg_ vs. stv_) 50 | #} 51 | 52 | {% call statement('list_relations_without_caching', fetch_result=True) -%} 53 | select 54 | '{{ schema_relation.database }}' as database, 55 | tablename as name, 56 | schemaname as schema, 57 | 'table' as type 58 | from pg_tables 59 | where schemaname ilike '{{ schema_relation.schema }}' 60 | union all 61 | select 62 | '{{ schema_relation.database }}' as database, 63 | viewname as name, 64 | schemaname as schema, 65 | case when definition ilike '%create materialized view%' 66 | then 'materializedview' 67 | else 'view' 68 | end as type 69 | from pg_views 70 | where schemaname ilike '{{ schema_relation.schema }}' 71 | {% endcall %} 72 | 73 | {{ return(load_result('list_relations_without_caching').table) }} 74 | {% endmacro %} 75 | 76 | 77 | {% macro redshift_load_relation_or_mv(relation) %} 78 | 79 | {% set rel = adapter.get_relation( 80 | database=relation.database, 81 | schema=relation.schema, 82 | identifier=relation.identifier 83 | ) -%} 84 | 85 | {% if rel.type == 'materializedview' and execute %} 86 | 87 | {# materialized views are not properly registered in pg_depend, 88 | so the cache can miss that they've been dropped 89 | https://github.com/awslabs/amazon-redshift-utils/issues/499 #} 90 | 91 | {% set hard_check_mv_sql %} 92 | 93 | select count(*) from stv_mv_info 94 | where schema = '{{ rel.schema }}' 95 | and name = '{{ rel.identifier }}' 96 | 97 | {% endset %} 98 | 99 | {% set result = run_query(hard_check_mv_sql)[0][0] %} 100 | {% set mv_rel = rel if result > 0 else none %} 101 | {% do return(mv_rel) %} 102 | 103 | {% else %} 104 | 105 | {% do return(rel) %} 106 | 107 | {% endif %} 108 | 109 | {% endmacro %} 110 | 111 | -------------------------------------------------------------------------------- /materialized-views/macros/snowflake/adapters.sql: -------------------------------------------------------------------------------- 1 | {% macro snowflake__create_materialized_view_as(relation, sql, config) -%} 2 | {%- set secure = config.get('secure', default=false) -%} 3 | {%- set cluster_by_keys = config.get('cluster_by', default=none) -%} 4 | {%- set enable_automatic_clustering = config.get('automatic_clustering', default=false) -%} 5 | {%- set sql_header = config.get('sql_header', none) -%} 6 | 7 | {%- if cluster_by_keys is not none and cluster_by_keys is string -%} 8 | {%- set cluster_by_keys = [cluster_by_keys] -%} 9 | {%- endif -%} 10 | {%- if cluster_by_keys is not none -%} 11 | {%- set cluster_by_string = cluster_by_keys|join(", ")-%} 12 | {% else %} 13 | {%- set cluster_by_string = none -%} 14 | {%- endif -%} 15 | 16 | {{ sql_header if sql_header is not none }} 17 | 18 | create or replace 19 | {% if secure -%} secure {%- endif %} 20 | materialized view {{relation}} 21 | as ( 22 | {{sql}} 23 | ); 24 | 25 | {% if cluster_by_string is not none and not temporary -%} 26 | alter materialized view {{relation}} cluster by ({{cluster_by_string}}); 27 | {%- endif -%} 28 | {% if enable_automatic_clustering and cluster_by_string is not none and not temporary -%} 29 | alter materialized view {{relation}} resume recluster; 30 | {%- endif -%} 31 | 32 | {% endmacro %} 33 | 
-------------------------------------------------------------------------------- /materialized-views/macros/snowflake/materialized_view.sql: -------------------------------------------------------------------------------- 1 | {% materialization materialized_view, adapter='snowflake' -%} 2 | 3 | {% set original_query_tag = set_query_tag() %} 4 | 5 | {% set full_refresh_mode = (should_full_refresh()) %} 6 | 7 | {% set target_relation = this %} 8 | {% set existing_relation = load_relation(this) %} 9 | {% set tmp_relation = make_temp_relation(this) %} 10 | 11 | {{ run_hooks(pre_hooks) }} 12 | 13 | {% if (existing_relation is none or full_refresh_mode) %} 14 | {% set build_sql = dbt_labs_materialized_views.create_materialized_view_as(target_relation, sql, config) %} 15 | {% elif existing_relation.is_view or existing_relation.is_table %} 16 | {#-- Can't overwrite a view with a table - we must drop --#} 17 | {{ log("Dropping relation " ~ target_relation ~ " because it is a " ~ existing_relation.type ~ " and this model is a materialized view.") }} 18 | {% do adapter.drop_relation(existing_relation) %} 19 | {% set build_sql = dbt_labs_materialized_views.create_materialized_view_as(target_relation, sql, config) %} 20 | {% else %} 21 | {# noop #} 22 | {% endif %} 23 | 24 | {% if build_sql %} 25 | {% call statement("main") %} 26 | {{ build_sql }} 27 | {% endcall %} 28 | {% else %} 29 | {{ store_result('main', 'SKIP') }} 30 | {% endif %} 31 | 32 | {{ run_hooks(post_hooks) }} 33 | 34 | {% do persist_docs(target_relation, model) %} 35 | 36 | {% do unset_query_tag(original_query_tag) %} 37 | 38 | {{ return({'relations': [target_relation]}) }} 39 | 40 | {%- endmaterialization %} 41 | -------------------------------------------------------------------------------- /read-external-iceberg/.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | dbt_packages/ 3 | logs/ 4 | user.yml 5 | profiles.yml 6 | .DS_Store 7 | *.pyc 8 | __pycache__/ -------------------------------------------------------------------------------- /read-external-iceberg/README.md: -------------------------------------------------------------------------------- 1 | # Reading external, unmanaged Iceberg tables as Sources 2 | 3 | > [!WARNING] 4 | > This feature is experimental and subject to change at any time 5 | 6 | An experimental extension to [dbt-labs/dbt-external-tables](https://github.com/dbt-labs/dbt-external-tables) that adds support for creating Iceberg tables that point to external catalogs not managed by the dbt project's warehouse. 7 | 8 | For more context, see this discussion: [dbt-core#11171: Just the tip of the Iceberg](https://github.com/dbt-labs/dbt-core/discussions/11171) 9 | 10 | 11 | ## Supported databases 12 | 13 | * Snowflake 14 | 15 | ## Installation 16 | 17 | ### Install this project as a package ([package-management docs](https://docs.getdbt.com/docs/building-a-dbt-project/package-management)) 18 | - [Local package](https://docs.getdbt.com/docs/building-a-dbt-project/package-management#local-packages): by referencing this [`read-external-iceberg/`](https://github.com/dbt-labs/dbt-labs-experimental-features/tree/master/read-external-iceberg) folder.
19 | - [Git package](https://docs.getdbt.com/docs/building-a-dbt-project/package-management#git-packages) using [project subdirectories](https://docs.getdbt.com/docs/building-a-dbt-project/package-management#git-packages): again by referencing the [`read-external-iceberg`](https://github.com/dbt-labs/dbt-labs-experimental-features/tree/master/read-external-iceberg) folder. 20 | 21 | ### Copy-paste the files from `macros/` into your own project 22 | 23 | Specifically, those in `plugins/snowflake/`. 24 | 25 | 26 | ## Configuration 27 | 28 | You'll need some form of the `dispatch` config below so that this package's macros take precedence: 29 | 30 | ```yml 31 | dispatch: 32 | - macro_namespace: dbt 33 | search_order: 34 | - 35 | - read_external_iceberg # if you're installing as a package 36 | - dbt_external_tables 37 | - dbt 38 | ``` 39 | 40 | 41 | ## Usage 42 | 43 | The exact same as [dbt-labs/dbt-external-tables](https://github.com/dbt-labs/dbt-external-tables)! 44 | 45 | ## Sample usage 46 | 47 | 48 | ```yml 49 | version: 2 50 | sources: 51 | - name: snowplow 52 | database: analytics 53 | schema: snowplow_external 54 | loader: S3 55 | loaded_at_field: collector_hour 56 | 57 | tables: 58 | - name: my_iceberg_table 59 | description: | 60 | Iceberg table using an external AWS Glue or REST catalog 61 | Additional Details: https://docs.snowflake.com/en/sql-reference/sql/create-iceberg-table#external-iceberg-catalog 62 | external: 63 | table_format: iceberg 64 | # existing external volume 65 | external_volume: my_external_volume 66 | # existing catalog integration 67 | catalog: my_catalog_integration 68 | # name of the table in the external catalog 69 | catalog_table_name: my_iceberg_table 70 | # name of the namespace in the external catalog 71 | # Hint: in AWS Glue this is the "Database" 72 | catalog_namespace: my_iceberg_table_namespace 73 | # optional; Specifies whether to replace invalid UTF-8 characters with the Unicode replacement character in query results 74 | replace_invalid_characters: true 75 | # optional; Specifies whether Snowflake should automatically poll the external Iceberg catalog 76 | # associated with the table for metadata updates when you use automated refresh 77 | auto_refresh: true 78 | # optional; Specifies a co 79 | ``` -------------------------------------------------------------------------------- /read-external-iceberg/dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: 'read_external_iceberg' 2 | version: '1.0.0' 3 | config-version: 2 4 | 5 | profile: 'read_external_iceberg' 6 | 7 | model-paths: ["models"] 8 | analysis-paths: ["analyses"] 9 | test-paths: ["tests"] 10 | seed-paths: ["seeds"] 11 | macro-paths: ["macros"] 12 | snapshot-paths: ["snapshots"] 13 | 14 | target-path: "target" 15 | clean-targets: 16 | - "target" 17 | - "dbt_packages" 18 | -------------------------------------------------------------------------------- /read-external-iceberg/macros/plugins/snowflake/create_iceberg_source.sql: -------------------------------------------------------------------------------- 1 | {% macro snowflake_create_iceberg_source(source_node) %} 2 | 3 | {% set relation = api.Relation.create( 4 | database = source_node.database, 5 | schema = source_node.schema, 6 | identifier = source_node.identifier 7 | ) %} 8 | 9 | {% set required_configs = ['external_volume', 'catalog', 'catalog_table_name', 'catalog_namespace'] %} 10 | {% set optional_configs = ['replace_invalid_characters', 'auto_refresh', 'comment'] %} 11 | 12 | {% set ddl %} 13 | create or replace
iceberg table {{ relation }} 14 | {% for config in required_configs %} 15 | {{ config }} = '{{ source_node.external.get(config) }}' 16 | {%- endfor -%} 17 | 18 | {% for config in optional_configs %} 19 | {% if config in source_node.external -%} 20 | 21 | {%- if source_node.external.get(config) is boolean -%} 22 | {{ config }} = {{ source_node.external.get(config) }} 23 | 24 | {%- else -%} 25 | {{ config }} = '{{ source_node.external.get(config) }}' 26 | {%- endif -%} 27 | 28 | {%- endif -%} 29 | {%- endfor -%} 30 | 31 | ; 32 | {% endset %} 33 | 34 | {{ ddl }} 35 | 36 | {% endmacro %} -------------------------------------------------------------------------------- /read-external-iceberg/macros/plugins/snowflake/get_external_build_plan.sql: -------------------------------------------------------------------------------- 1 | {% macro snowflake__get_external_build_plan(source_node) %} 2 | 3 | {% set build_plan = [] %} 4 | 5 | {% set old_relation = adapter.get_relation( 6 | database = source_node.database, 7 | schema = source_node.schema, 8 | identifier = source_node.identifier 9 | ) %} 10 | 11 | {% set create_or_replace = (old_relation is none or var('ext_full_refresh', false)) %} 12 | 13 | {% if source_node.external.get('table_format') == 'iceberg' %} 14 | 15 | {% set build_plan = build_plan + [ 16 | dbt_external_tables.create_external_schema(source_node), 17 | dbt_external_tables.snowflake_create_iceberg_source(source_node) 18 | ] %} 19 | 20 | {% elif source_node.external.get('snowpipe', none) is not none %} 21 | 22 | {% if create_or_replace %} 23 | {% set build_plan = build_plan + [ 24 | dbt_external_tables.create_external_schema(source_node), 25 | dbt_external_tables.snowflake_create_empty_table(source_node), 26 | dbt_external_tables.snowflake_get_copy_sql(source_node, explicit_transaction=true), 27 | dbt_external_tables.snowflake_create_snowpipe(source_node) 28 | ] %} 29 | {% else %} 30 | {% set build_plan = build_plan + dbt_external_tables.snowflake_refresh_snowpipe(source_node) %} 31 | {% endif %} 32 | 33 | {% else %} 34 | 35 | {% if create_or_replace %} 36 | {% set build_plan = build_plan + [ 37 | dbt_external_tables.create_external_schema(source_node), 38 | dbt_external_tables.create_external_table(source_node) 39 | ] %} 40 | {% else %} 41 | {% set build_plan = build_plan + dbt_external_tables.refresh_external_table(source_node) %} 42 | {% endif %} 43 | 44 | {% endif %} 45 | 46 | {% do return(build_plan) %} 47 | 48 | {% endmacro %} 49 | -------------------------------------------------------------------------------- /read-external-iceberg/packages.yml: -------------------------------------------------------------------------------- 1 | packages: 2 | - package: dbt-labs/dbt_external_tables 3 | version: 0.11.1 # Using a recent stable version -------------------------------------------------------------------------------- /snapshot-testing/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | target/ 3 | dbt_modules/ 4 | logs/ 5 | -------------------------------------------------------------------------------- /snapshot-testing/README.md: -------------------------------------------------------------------------------- 1 | # Using snapshots to detect dbt model regressions 2 | This dbt project is a worked example to demonstrate how to use snapshots to detect dbt model regressions. **Check out the full write-up [on Discourse](to-do).** 3 | 4 | The SQL in this project is compatible with Snowflake¹. 
5 | 6 | If you want to run this project yourself to play with it (assuming you have 7 | dbt installed): 8 | 1. Clone this repo. 9 | 2. `cd` into this directory. 10 | 3. Create a profile named `acme`, or update the `profile:` key in the `dbt_project.yml` file to point to an existing profile ([docs](https://docs.getdbt.com/docs/configure-your-profile)). 11 | 4. Run `dbt seed`. 12 | 5. Run `dbt snapshot`. 13 | 6. Run `dbt test` — no test failures should occur. 14 | 7. Run `dbt snapshot` a second time — on this run, a regression should be introduced. 15 | 8. Run `dbt test` to see the failure. 16 | 9. Run `dbt run-operation historic_revenue_snapshot_cleanup` to move the rogue record into an audit table. 17 | 10. Run `dbt test` again to see the healed failure. 18 | 19 | ----- 20 | 1. We decided to _not_ check that the SQL in this project is multi-warehouse compatible — it _might_ work on other warehouses! 21 | -------------------------------------------------------------------------------- /snapshot-testing/data/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/snapshot-testing/data/.gitkeep -------------------------------------------------------------------------------- /snapshot-testing/data/fct_orders.csv: -------------------------------------------------------------------------------- 1 | order_id,customer_id,order_date,amount 2 | 1,1,2020-01-01,10 3 | 2,3,2020-01-01,20 4 | 3,94,2020-01-02,1 5 | 4,50,2020-01-03,25 6 | 5,64,2020-01-03,17 7 | 6,54,2020-01-04,6 8 | -------------------------------------------------------------------------------- /snapshot-testing/dbt_project.yml: -------------------------------------------------------------------------------- 1 | name: 'acme' 2 | # config-version: 2 3 | version: 1.0.0 4 | require-dbt-version: ">=0.17.1" 5 | 6 | profile: acme 7 | 8 | source-paths: ["models"] 9 | analysis-paths: ["analysis"] 10 | test-paths: ["tests"] 11 | data-paths: ["data"] 12 | macro-paths: ["macros"] 13 | snapshot-paths: ["snapshots"] 14 | 15 | target-path: "target" # directory which will store compiled SQL files 16 | clean-targets: # directories to be removed by `dbt clean` 17 | - "target" 18 | - "dbt_modules" 19 | -------------------------------------------------------------------------------- /snapshot-testing/macros/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/snapshot-testing/macros/.gitkeep -------------------------------------------------------------------------------- /snapshot-testing/macros/historic_revenue_snapshot_cleanup.sql: -------------------------------------------------------------------------------- 1 | {% macro historic_revenue_snapshot_cleanup() %} 2 | {% set create_sql %} 3 | create table if not exists dbt_snapshots.historic_revenue_snapshot_invalidated as ( 4 | select 5 | *, 6 | current_timestamp as _inserted_at 7 | from {{ ref('historic_revenue_snapshot') }} 8 | limit 0 9 | ) 10 | {% endset %} 11 | {% set insert_sql %} 12 | insert into dbt_snapshots.historic_revenue_snapshot_invalidated ( 13 | select 14 | *, 15 | current_timestamp as _inserted_at 16 | from {{ ref('historic_revenue_snapshot') }} 17 | where dbt_valid_to is not null 18 | ); 19 | {% endset %} 20 | 21 | {% set delete_sql %} 22 | delete from {{ ref('historic_revenue_snapshot') }}
where dbt_valid_to is not null 23 | {% endset %} 24 | 25 | {% do run_query('begin') %} 26 | {% do run_query(create_sql) %} 27 | {% do run_query(insert_sql) %} 28 | {% do run_query(delete_sql) %} 29 | {% do run_query('commit') %} 30 | 31 | {% endmacro %} 32 | -------------------------------------------------------------------------------- /snapshot-testing/macros/test_is_null.sql: -------------------------------------------------------------------------------- 1 | {% macro test_is_null(model) %} 2 | 3 | {% set column_name = kwargs.get('column_name', kwargs.get('arg')) %} 4 | 5 | select count(*) as validation_errors 6 | from {{ model }} 7 | where not({{ column_name }} is null) 8 | 9 | {% endmacro %} 10 | -------------------------------------------------------------------------------- /snapshot-testing/snapshots/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/snapshot-testing/snapshots/.gitkeep -------------------------------------------------------------------------------- /snapshot-testing/snapshots/historic_revenue_snapshot.sql: -------------------------------------------------------------------------------- 1 | {% snapshot historic_revenue_snapshot %} 2 | 3 | {{ 4 | config( 5 | target_schema='dbt_snapshots', 6 | strategy='check', 7 | unique_key='date_day', 8 | check_cols=['total_revenue'] 9 | ) 10 | }} 11 | 12 | select 13 | order_date as date_day, 14 | sum(amount) as total_revenue 15 | from {{ ref('fct_orders') }} 16 | 17 | {# we're going to use this hack to make a record disappear on the second run of this #} 18 | {% if adapter.get_relation(this.database, this.schema, this.table) is not none %} 19 | where order_id != 4 20 | {% endif %} 21 | 22 | group by 1 23 | 24 | 25 | {% endsnapshot %} 26 | -------------------------------------------------------------------------------- /snapshot-testing/snapshots/schema.yml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | snapshots: 4 | - name: historic_revenue_snapshot 5 | columns: 6 | - name: dbt_valid_to 7 | tests: 8 | - is_null 9 | -------------------------------------------------------------------------------- /snapshot-testing/tests/.gitkeep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dbt-labs/dbt-labs-experimental-features/458f0a49f165e55f5dcac45e54226f215fda3d07/snapshot-testing/tests/.gitkeep --------------------------------------------------------------------------------