├── .env.example ├── .github ├── CODEOWNERS └── workflows │ ├── lint.yaml │ ├── release.yaml │ └── test.yaml ├── .gitignore ├── CHANGELOG.md ├── LEGAL.md ├── LICENSE ├── Makefile ├── README.md ├── app.py ├── app_utils ├── __init__.py ├── chat.py ├── looker_sdk.zip ├── shared_utils.py └── strictyaml.zip ├── artifacts ├── SMG_DEMO.csv └── customers.yml ├── environment.yml ├── images ├── dbt-signature_tm_black.png ├── error39.png └── looker.png ├── journeys ├── __init__.py ├── builder.py ├── evaluation.py ├── iteration.py ├── joins.py └── partner.py ├── mypy.ini ├── partner ├── __init__.py ├── cortex.py ├── dbt.py ├── looker.py └── partner_utils.py ├── poetry.lock ├── pyproject.toml ├── semantic_model_generator ├── __init__.py ├── data_processing │ ├── __init__.py │ ├── cte_utils.py │ ├── cte_utils_test.py │ ├── data_types.py │ └── proto_utils.py ├── generate_model.py ├── output_models │ └── .keep ├── protos │ ├── semantic_model.proto │ ├── semantic_model_pb2.py │ └── semantic_model_pb2.pyi ├── snowflake_utils │ ├── env_vars.py │ ├── snowflake_connector.py │ └── utils.py ├── tests │ ├── cte_utils_test.py │ ├── generate_model_test.py │ ├── samples │ │ └── validate_yamls.py │ ├── snowflake_connector_test.py │ ├── utils_test.py │ ├── validate_model_test.py │ └── yaml_to_semantic_model_test.py ├── validate │ ├── context_length.py │ ├── keywords.py │ └── schema.py └── validate_model.py └── sis_setup ├── app_setup.sql ├── looker_integration.sql └── sissetup_snowsightgit.sql /.env.example: -------------------------------------------------------------------------------- 1 | # Example config for username/password auth 2 | SNOWFLAKE_ROLE="" 3 | SNOWFLAKE_WAREHOUSE="" 4 | SNOWFLAKE_USER="" 5 | SNOWFLAKE_PASSWORD="" 6 | SNOWFLAKE_ACCOUNT_LOCATOR="" 7 | SNOWFLAKE_HOST="" 8 | 9 | 10 | # Example config for externalbrowser auth 11 | SNOWFLAKE_ROLE="" 12 | SNOWFLAKE_WAREHOUSE="" 13 | SNOWFLAKE_USER="" 14 | SNOWFLAKE_PASSWORD="" 15 | SNOWFLAKE_ACCOUNT_LOCATOR="" 16 | SNOWFLAKE_HOST="" 17 | SNOWFLAKE_AUTHENTICATOR="externalbrowser" 18 | 19 | 20 | # Example config for username/password auth using MFA 21 | SNOWFLAKE_ROLE="" 22 | SNOWFLAKE_WAREHOUSE="" 23 | SNOWFLAKE_USER="" 24 | SNOWFLAKE_PASSWORD="" 25 | SNOWFLAKE_ACCOUNT_LOCATOR="" 26 | SNOWFLAKE_HOST="" 27 | SNOWFLAKE_AUTHENTICATOR="username_password_mfa" 28 | SNOWFLAKE_MFA_PASSCODE="" 29 | -------------------------------------------------------------------------------- /.github/CODEOWNERS: -------------------------------------------------------------------------------- 1 | * @sfc-gh-cnivera @sfc-gh-jsummer 2 | /semantic_model_generator/ @sfc-gh-rehuang @sfc-gh-cnivera @sfc-gh-jsummer 3 | -------------------------------------------------------------------------------- /.github/workflows/lint.yaml: -------------------------------------------------------------------------------- 1 | name: Semantic Model Format & Lint 2 | 3 | on: 4 | pull_request: 5 | branches: 6 | - "*" 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | strategy: 12 | matrix: 13 | python-version: [ "3.10" ] 14 | 15 | steps: 16 | - name: Check out the code 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | - name: Cache Poetry virtualenv 25 | uses: actions/cache@v4 26 | with: 27 | path: ~/.cache/pypoetry/virtualenvs 28 | key: ${{ runner.os }}-poetry-${{ hashFiles('poetry.lock') }} 29 | restore-keys: | 30 | ${{ runner.os }}-poetry- 31 | 32 | - 
name: Install Poetry 33 | run: | 34 | python3 -m pip install --user pipx 35 | python3 -m pipx ensurepath 36 | pipx install poetry 37 | 38 | - name: Configure Poetry 39 | run: | 40 | export PATH="$HOME/.local/bin:$PATH" 41 | poetry config virtualenvs.create false 42 | 43 | - name: Install dependencies using Poetry 44 | run: | 45 | poetry install --no-interaction 46 | 47 | - name: Run mypy 48 | id: mypy 49 | run: | 50 | make run_mypy 51 | continue-on-error: true 52 | 53 | - name: Check with black 54 | id: black 55 | run: | 56 | make check_black 57 | continue-on-error: true 58 | 59 | - name: Check with isort 60 | id: isort 61 | run: | 62 | make check_isort 63 | continue-on-error: true 64 | 65 | - name: Run flake8 66 | id: flake8 67 | run: | 68 | make run_flake8 69 | continue-on-error: true 70 | 71 | - name: Report failures 72 | run: | 73 | if [ "${{ steps.black.outcome }}" != "success" ]; then echo "black failed"; FAIL=1; fi 74 | if [ "${{ steps.isort.outcome }}" != "success" ]; then echo "isort failed"; FAIL=1; fi 75 | if [ "${{ steps.flake8.outcome }}" != "success" ]; then echo "flake8 failed"; FAIL=1; fi 76 | if [ "$FAIL" == "1" ]; then exit 1; fi 77 | continue-on-error: false 78 | -------------------------------------------------------------------------------- /.github/workflows/release.yaml: -------------------------------------------------------------------------------- 1 | name: Build and Attach Wheel to GitHub Release 2 | 3 | on: 4 | push: 5 | tags: 6 | - 'release/v*' 7 | 8 | jobs: 9 | build: 10 | runs-on: ubuntu-latest 11 | steps: 12 | - uses: actions/checkout@v4 13 | - name: Set up Python 14 | uses: actions/setup-python@v4 15 | with: 16 | python-version: '3.10' 17 | - name: Install Poetry 18 | run: pip install poetry 19 | - name: Get the version from pyproject.toml 20 | run: echo "VERSION=$(poetry version -s)" >> $GITHUB_ENV 21 | - name: Build Wheel 22 | run: poetry build -f wheel 23 | - name: Extract Changelog for the Version 24 | run: | 25 | VERSION=${{ env.VERSION }} 26 | CHANGELOG=$(awk '/^## \['"${VERSION//./\\.}"'\]/ {flag=1; next} /^## \[/ {flag=0} flag' CHANGELOG.md) 27 | echo "CHANGELOG<> $GITHUB_ENV 28 | echo "$CHANGELOG" >> $GITHUB_ENV 29 | echo "EOF" >> $GITHUB_ENV 30 | - name: Upload Wheel to Release 31 | uses: softprops/action-gh-release@v1 32 | with: 33 | files: dist/*.whl 34 | body: ${{ env.CHANGELOG }} 35 | env: 36 | GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} 37 | -------------------------------------------------------------------------------- /.github/workflows/test.yaml: -------------------------------------------------------------------------------- 1 | name: Semantic Model Generator Test 2 | 3 | on: 4 | pull_request: 5 | paths: 6 | - "semantic_model_generator/**" 7 | - "pyproject.toml" 8 | 9 | jobs: 10 | build: 11 | runs-on: ubuntu-latest 12 | strategy: 13 | matrix: 14 | python-version: [ "3.10" ] 15 | steps: 16 | - name: Check out the code 17 | uses: actions/checkout@v4 18 | 19 | - name: Set up Python ${{ matrix.python-version }} 20 | uses: actions/setup-python@v5 21 | with: 22 | python-version: ${{ matrix.python-version }} 23 | 24 | # Caching dependencies using Poetry 25 | - name: Cache Poetry virtualenv 26 | uses: actions/cache@v4 27 | with: 28 | path: ~/.cache/pypoetry/virtualenvs 29 | key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }} 30 | restore-keys: | 31 | ${{ runner.os }}-poetry- 32 | 33 | - name: Install Poetry 34 | run: | 35 | curl -sSL https://install.python-poetry.org | python3 - 36 | 37 | - name: Configure Poetry 38 | run: | 39 | 
$HOME/.local/bin/poetry config virtualenvs.create false 40 | 41 | - name: Install dependencies using Poetry 42 | run: | 43 | $HOME/.local/bin/poetry install --no-interaction 44 | 45 | - name: Test 46 | run: | 47 | make test_github_workflow 48 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Local python environment 2 | pyvenv 3 | *.DS_Store 4 | 5 | # Notebook intermediate state 6 | *.ipynb_checkpoints 7 | 8 | # Mypy 9 | .mypy_cache 10 | 11 | # Pytest 12 | .pytest_cache 13 | 14 | # pycache 15 | **/__pycache__ 16 | 17 | # Python package builds 18 | *.egg-info 19 | 20 | # VSCode 21 | .vscode/settings.json 22 | .vscode/launch.json 23 | .vscode/.ropeproject 24 | .vscode/*.log 25 | .vscode/*.json 26 | 27 | # Jetbrains 28 | .idea/* 29 | 30 | # Envs 31 | .env 32 | .venv 33 | .direnv 34 | .envrc 35 | 36 | # Output semantic models 37 | semantic_model_generator/output_models/*.yaml 38 | 39 | # test coverage 40 | .coverage -------------------------------------------------------------------------------- /CHANGELOG.md: -------------------------------------------------------------------------------- 1 | # Changelog 2 | 3 | You must follow the format of `## [VERSION-NUMBER]` for the GitHub workflow to pick up the text. 4 | 5 | ## [0.1.33] - 2024-08-07 6 | 7 | ### Updates 8 | 9 | - Throw an error during validation if a user adds duplicate verified queries to their semantic model. 10 | 11 | ## [0.1.32] - 2024-07-30 12 | 13 | ### Updates 14 | 15 | - Bump context length validation limit. 16 | - Fix union type hints for support with Python <3.10. 17 | 18 | ## [0.1.31] - 2024-07-29 19 | 20 | ### Updates 21 | 22 | - Include new `secure-local-storage` extra package for `snowflake-python-connector` dependency. 23 | 24 | ## [0.1.30] - 2024-07-12 25 | 26 | ### Updates 27 | 28 | - Restrict Python version to < 3.12 in order to avoid issues with pyarrow dependency. 29 | 30 | ## [0.1.29] - 2024-07-10 31 | 32 | ### Updates 33 | 34 | - Allow single sign on auth. 35 | 36 | ## [0.1.28] - 2024-07-09 37 | 38 | ### Updates 39 | 40 | - Allow auto-generation of descriptions for semantic models. 41 | 42 | ## [0.1.27] - 2024-07-03 43 | 44 | ### Updates 45 | 46 | - Fix VQR validation for measures with aggregation calculation. 47 | - Update pulling sample value by dimension vs. measures; fix length validation logic. 48 | 49 | ## [0.1.26] - 2024-07-02 50 | 51 | ### Updates 52 | 53 | - Semantic model size validation allows for many more sample values. 54 | This corresponds with a release of the Cortex Analyst that does dynamic sample value retrieval by default. 55 | 56 | ## [0.1.25] - 2024-06-18 57 | 58 | ### Updates 59 | 60 | - Plumb through column and table comments 61 | - Skip host name match verification for now 62 | 63 | ## [0.1.24] - 2024-06-17 64 | 65 | ### Updates 66 | 67 | - Consolidate validations to use the same set of utils 68 | - Handle the validation for expr with aggregations properly 69 | 70 | ## [0.1.23] - 2024-06-13 71 | 72 | ### Updates 73 | 74 | - Remove VQR from context length calculation. 75 | - Add toggle for number of sample values. 76 | 77 | ## [0.1.22] - 2024-06-11 78 | 79 | ### Updates 80 | 81 | - Fix small streamlit app components to be compatible with python 3.8 82 | 83 | ## [0.1.21] - 2024-06-10 84 | 85 | ### Updates 86 | 87 | - Add validation for verified queries; 88 | - Add streamlit admin app for semantic model generation, validation and verified query flow. 
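The `release.yaml` workflow above pulls the release notes for the tagged version out of `CHANGELOG.md` with an `awk` one-liner keyed on the `## [VERSION-NUMBER]` heading format that this changelog requires. For readers who prefer Python, here is a rough equivalent of that extraction step; it is a sketch for illustration only, not code that ships in this repository:

```python
from pathlib import Path


def extract_changelog_section(changelog_path: str, version: str) -> str:
    """Return the body of the `## [version]` section, mirroring the awk step in release.yaml."""
    lines = Path(changelog_path).read_text().splitlines()
    header = f"## [{version}]"
    capture = False
    captured: list[str] = []
    for line in lines:
        if line.startswith(header):
            capture = True  # start capturing after the matching version heading
            continue
        if capture and line.startswith("## ["):
            break  # stop at the next version heading
        if capture:
            captured.append(line)
    return "\n".join(captured).strip()


if __name__ == "__main__":
    # Hypothetical usage; assumes CHANGELOG.md sits in the working directory.
    print(extract_changelog_section("CHANGELOG.md", "0.1.33"))
```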
89 | 90 | ## [0.1.20] - 2024-05-31 91 | 92 | ### Updates 93 | 94 | - Fix for validation CLI and README 95 | 96 | ## [0.1.19] - 2024-05-31 97 | 98 | ### Updates 99 | 100 | - Fix protobuf version to be compatible with streamlit 101 | - Small refactor in validation file 102 | 103 | ## [0.1.18] - 2024-05-31 104 | 105 | ### Updates 106 | 107 | - Add proto definition for verified queries; also add proto for Column (for backward compatibility only) 108 | 109 | ## [0.1.17] - 2024-05-21 110 | 111 | ### Updates 112 | 113 | - Allow flow style in yaml validation 114 | 115 | ## [0.1.16] - 2024-05-15 116 | 117 | ### Updates 118 | 119 | - Remove validation of context length to after save. 120 | - Uppercase db/schema/table(s) 121 | 122 | ## [0.1.15] - 2024-05-14 123 | 124 | ### Updates 125 | 126 | - Use strictyaml to validate the semantic model yaml matches the expected schema and has all required fields 127 | 128 | ## [0.1.14] - 2024-05-13 129 | 130 | ### Updates 131 | 132 | - Fix aggregations 133 | - Context limit 134 | 135 | ## [0.1.13] - 2024-05-08 136 | 137 | ### Updates 138 | 139 | - Object types not supported in generation or validation. 140 | 141 | ## [0.1.12] - 2024-05-03 142 | 143 | ### Updates 144 | 145 | - Naming 146 | - Validate no expressions in cols in yaml 147 | 148 | ## [0.1.11] - 2024-05-01 149 | 150 | ### Updates 151 | 152 | - Save path location 153 | 154 | ## [0.1.10] - 2024-05-01 155 | 156 | ### Updates 157 | 158 | - Save path location 159 | 160 | ## [0.1.9] - 2024-04-29 161 | 162 | ### Updates 163 | 164 | - Add additional validation for mismatched quotes. Test incorrect enums. 165 | 166 | ## [0.1.8] - 2024-04-23 167 | 168 | ### Updates 169 | 170 | - run select against given cols in semantic model for validation 171 | 172 | ## [0.1.7] - 2024-04-18 173 | 174 | ### Updates 175 | 176 | - Parse yaml model into protos, validate cols and col naming 177 | 178 | ## [0.1.6] - 2024-04-16 179 | 180 | ### Updates 181 | 182 | - First yaml validation included. 183 | 184 | ## [0.1.5] - 2024-04-15d 185 | 186 | ### Updates 187 | 188 | - Downgrade pyarrow 189 | 190 | ## [0.1.4] - 2024-04-15c 191 | 192 | ### Updates 193 | 194 | - Spacing typo 195 | 196 | ## [0.1.3] - 2024-04-15b 197 | 198 | ### Updates 199 | 200 | - Fix 3.8 typing 201 | - Some function renaming 202 | - Support all Snowflake datatypes 203 | 204 | ## [0.1.2] - 2024-04-15 205 | 206 | ### Updates 207 | 208 | - Downgrade to python 3.8 and resolve typing issues with optional. 209 | - Fix FQN parts for pydantic errors. 210 | - Update README to be less restrictive for installs. 211 | 212 | ## [0.1.1] - 2024-04-09 213 | 214 | ### Released 215 | 216 | - Verify release workflow works as intended 217 | 218 | ## [0.1.0] - 2024-04-08 219 | 220 | ### Released 221 | 222 | - Initial release of the project. 223 | -------------------------------------------------------------------------------- /LEGAL.md: -------------------------------------------------------------------------------- 1 | This application is not part of the Snowflake Service and is governed by the terms in LICENSE, unless expressly agreed to in writing. You use this application at your own risk, and Snowflake has no obligation to support your use of this application. 
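Stepping back to the configuration shown at the top of the dump: `.env.example` lists the Snowflake variables the app reads (role, warehouse, user, password, account locator, host, and optionally `SNOWFLAKE_AUTHENTICATOR` for the externalbrowser or MFA flows). The app builds its connection in `semantic_model_generator/snowflake_utils/snowflake_connector.py`; the snippet below is only a minimal sketch of consuming those same variables with `snowflake-connector-python` and `python-dotenv` (both already dependencies here), not the app's actual connection code:

```python
import os

import snowflake.connector
from dotenv import load_dotenv  # provided by python-dotenv, listed in environment.yml

load_dotenv()  # read variables from a local .env file, if present

# Build a connection from the variables named in .env.example.
conn = snowflake.connector.connect(
    user=os.environ["SNOWFLAKE_USER"],
    password=os.environ.get("SNOWFLAKE_PASSWORD"),
    account=os.environ["SNOWFLAKE_ACCOUNT_LOCATOR"],
    role=os.environ.get("SNOWFLAKE_ROLE"),
    warehouse=os.environ.get("SNOWFLAKE_WAREHOUSE"),
    host=os.environ.get("SNOWFLAKE_HOST"),
    # .env.example also sets SNOWFLAKE_AUTHENTICATOR for externalbrowser or
    # username_password_mfa auth; "snowflake" is the connector's default.
    authenticator=os.environ.get("SNOWFLAKE_AUTHENTICATOR", "snowflake"),
)
try:
    print(conn.cursor().execute("SELECT CURRENT_VERSION()").fetchone())
finally:
    conn.close()
```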
2 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 
61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 
122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 
179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY: run_admin_app 2 | 3 | install-poetry: 4 | curl -sSL https://install.python-poetry.org | python3 - 5 | 6 | install-homebrew: 7 | /bin/bash -c "$$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" 8 | 9 | install-pyenv: 10 | @command -v brew >/dev/null 2>&1 || $(MAKE) install-homebrew 11 | brew install pyenv 12 | 13 | install-python-3.8: 14 | @echo "Python 3.8 not found. Installing Python 3.8 using pyenv." 15 | @pyenv install 3.8 16 | @pyenv local 3.8 17 | 18 | check-deps: 19 | @command -v poetry >/dev/null 2>&1 || $(MAKE) install-poetry 20 | 21 | 22 | shell: check-deps ## Get into a poetry shell 23 | poetry shell 24 | 25 | setup: check-deps shell ## Install dependencies into your poetry environment. 26 | poetry install 27 | 28 | # app 29 | run_admin_app: 30 | python -m streamlit run app.py 31 | 32 | # Installs dependencies for the admin app. 33 | setup_admin_app: 34 | pip install . 35 | 36 | # Linting and formatting below. 37 | run_mypy: ## Run mypy 38 | mypy --config-file=mypy.ini . 39 | 40 | run_flake8: ## Run flake8 41 | flake8 --ignore=E203,E501,W503 --exclude=venv,.venv,pyvenv,tmp,*_pb2.py,*_pb2.pyi,images/*/src . 42 | 43 | check_black: ## Check to see if files would be updated with black. 44 | # Exclude pyvenv and all generated protobuf code. 45 | black --check --exclude=".venv|venv|pyvenv|.*_pb2.py|.*_pb2.pyi" . 46 | 47 | run_black: ## Run black to format files. 48 | # Exclude pyvenv, tmp, and all generated protobuf code. 49 | black --exclude=".venv|venv|pyvenv|tmp|.*_pb2.py|.*_pb2.pyi" . 50 | 51 | check_isort: ## Check if files would be updated with isort. 52 | isort --profile black --check --skip=venv --skip=pyvenv --skip=.venv --skip-glob='*_pb2.py*' . 53 | 54 | run_isort: ## Run isort to update imports. 55 | isort --profile black --skip=pyvenv --skip=venv --skip=tmp --skip=.venv --skip-glob='*_pb2.py*' . 56 | 57 | 58 | fmt_lint: shell ## lint/fmt in current python environment 59 | make run_black run_isort run_flake8 60 | 61 | # Test below 62 | test: shell ## Run tests. 63 | python -m pytest -vvs semantic_model_generator 64 | 65 | test_github_workflow: ## For use on github workflow. 
66 | python -m pytest -vvs semantic_model_generator 67 | 68 | # Release 69 | update-version: ## Bump poetry and github version. TYPE should be `patch` `minor` or `major` 70 | @echo "Updating Poetry version ($(TYPE)) and creating a Git tag..." 71 | @poetry version $(TYPE) 72 | @echo "Version updated to $$VERSION. Update the CHANGELOG.md `make release`" 73 | 74 | release: ## Runs the release workflow. 75 | @VERSION=$$(poetry version -s) && git commit --allow-empty -m "Bump version to $$VERSION" && git tag release/v$$VERSION && \ 76 | git push origin HEAD && git push origin HEAD --tags 77 | 78 | build: ## Clean the dist dir and build the whl file 79 | rm -rf dist 80 | mkdir dist 81 | poetry build 82 | 83 | help: ## Show this help. 84 | @fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from snowflake.connector import DatabaseError 3 | from snowflake.connector.connection import SnowflakeConnection 4 | 5 | # set_page_config must be run as the first Streamlit command on the page, before any other streamlit imports. 6 | st.set_page_config(layout="wide", page_icon="💬", page_title="Semantic Model Generator") 7 | 8 | from app_utils.shared_utils import ( # noqa: E402 9 | GeneratorAppScreen, 10 | get_snowflake_connection, 11 | set_account_name, 12 | set_host_name, 13 | set_sit_query_tag, 14 | set_snowpark_session, 15 | set_streamlit_location, 16 | set_user_name, 17 | ) 18 | from semantic_model_generator.snowflake_utils.env_vars import ( # noqa: E402 19 | SNOWFLAKE_ACCOUNT_LOCATOR, 20 | SNOWFLAKE_HOST, 21 | SNOWFLAKE_USER, 22 | ) 23 | 24 | 25 | @st.experimental_dialog(title="Connection Error") 26 | def failed_connection_popup() -> None: 27 | """ 28 | Renders a dialog box detailing that the credentials provided could not be used to connect to Snowflake. 29 | """ 30 | st.markdown( 31 | """It looks like the credentials provided could not be used to connect to the account.""" 32 | ) 33 | st.stop() 34 | 35 | 36 | def verify_environment_setup() -> SnowflakeConnection: 37 | """ 38 | Ensures that the correct environment variables are set before proceeding. 39 | """ 40 | 41 | # Instantiate the Snowflake connection that gets reused throughout the app. 42 | try: 43 | with st.spinner( 44 | "Validating your connection to Snowflake. If you are using MFA, please check your authenticator app for a push notification." 45 | ): 46 | return get_snowflake_connection() 47 | except DatabaseError: 48 | failed_connection_popup() 49 | 50 | 51 | if __name__ == "__main__": 52 | from journeys import builder, iteration, partner 53 | 54 | st.session_state["sis"] = set_streamlit_location() 55 | 56 | def onboarding_dialog() -> None: 57 | """ 58 | Renders the initial screen where users can choose to create a new semantic model or edit an existing one. 59 | """ 60 | 61 | # Direct to specific page based instead of default onboarding if user comes from successful partner setup 62 | st.markdown( 63 | """ 64 |
65 | Welcome to the Snowflake Semantic Model Generator! ❄️
66 | ⚠️ Heads up! The Streamlit app is no longer supported for semantic model creation.
67 | 👉 Please use the Snowsight UI in Snowflake to create and update semantic models — it’s newer and works better!
68 | ✅ Once your model is created in Snowsight, come back here to run evaluations, which still work best in this app.
69 | 
70 | """, 71 | unsafe_allow_html=True, 72 | ) 73 | 74 | st.markdown("
", unsafe_allow_html=True) 75 | 76 | _, center, _ = st.columns([1, 2, 1]) 77 | with center: 78 | if st.button( 79 | "**[⚠️ Deprecated]🛠 Create a new semantic model**", 80 | use_container_width=True, 81 | type="primary", 82 | ): 83 | builder.show() 84 | st.markdown("") 85 | if st.button( 86 | "**✏️ Edit an existing semantic model**", 87 | use_container_width=True, 88 | type="primary", 89 | ): 90 | iteration.show() 91 | st.markdown("") 92 | if st.button( 93 | "**[⚠️ Deprecated]📦 Start with partner semantic model**", 94 | use_container_width=True, 95 | type="primary", 96 | ): 97 | set_sit_query_tag( 98 | get_snowflake_connection(), 99 | vendor="", 100 | action="start", 101 | ) 102 | partner.show() 103 | 104 | conn = verify_environment_setup() 105 | set_snowpark_session(conn) 106 | 107 | # Populating common state between builder and iteration apps. 108 | set_account_name(conn, SNOWFLAKE_ACCOUNT_LOCATOR) 109 | set_host_name(conn, SNOWFLAKE_HOST) 110 | set_user_name(conn, SNOWFLAKE_USER) 111 | 112 | # When the app first loads, show the onboarding screen. 113 | if "page" not in st.session_state: 114 | st.session_state["page"] = GeneratorAppScreen.ONBOARDING 115 | 116 | # Depending on the page state, we either show the onboarding menu or the chat app flow. 117 | # The builder flow is simply an intermediate dialog before the iteration flow. 118 | if st.session_state["page"] == GeneratorAppScreen.ITERATION: 119 | iteration.show() 120 | else: 121 | onboarding_dialog() 122 | -------------------------------------------------------------------------------- /app_utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/app_utils/__init__.py -------------------------------------------------------------------------------- /app_utils/chat.py: -------------------------------------------------------------------------------- 1 | import json 2 | import re 3 | from typing import Any, Dict 4 | 5 | import requests 6 | import streamlit as st 7 | from snowflake.connector import SnowflakeConnection 8 | 9 | API_ENDPOINT = "https://{HOST}/api/v2/cortex/analyst/message" 10 | 11 | 12 | @st.cache_data(ttl=60, show_spinner=False) 13 | def send_message( 14 | _conn: SnowflakeConnection, semantic_model: str, messages: list[dict[str, str]] 15 | ) -> Dict[str, Any]: 16 | """ 17 | Calls the REST API with a list of messages and returns the response. 18 | Args: 19 | _conn: SnowflakeConnection, used to grab the token for auth. 20 | messages: list of chat messages to pass to the Analyst API. 21 | semantic_model: stringified YAML of the semantic model. 22 | 23 | Returns: The raw ChatMessage response from Analyst. 24 | """ 25 | request_body = { 26 | "messages": messages, 27 | "semantic_model": semantic_model, 28 | } 29 | 30 | if st.session_state["sis"]: 31 | import _snowflake 32 | 33 | resp = _snowflake.send_snow_api_request( # type: ignore 34 | "POST", 35 | "/api/v2/cortex/analyst/message", 36 | {}, 37 | {}, 38 | request_body, 39 | {}, 40 | 30000, 41 | ) 42 | if resp["status"] < 400: 43 | json_resp: Dict[str, Any] = json.loads(resp["content"]) 44 | return json_resp 45 | else: 46 | err_body = json.loads(resp["content"]) 47 | if "message" in err_body: 48 | # Certain errors have a message payload with a link to the github repo, which we should remove. 
49 | error_msg = re.sub( 50 | r"\s*Please use https://github\.com/Snowflake-Labs/semantic-model-generator.*", 51 | "", 52 | err_body["message"], 53 | ) 54 | raise ValueError(error_msg) 55 | raise ValueError(err_body) 56 | 57 | else: 58 | host = st.session_state.host_name 59 | resp = requests.post( 60 | API_ENDPOINT.format( 61 | HOST=host, 62 | ), 63 | json=request_body, 64 | headers={ 65 | "Authorization": f'Snowflake Token="{_conn.rest.token}"', # type: ignore[union-attr] 66 | "Content-Type": "application/json", 67 | }, 68 | ) 69 | if resp.status_code < 400: 70 | json_resp: Dict[str, Any] = resp.json() 71 | return json_resp 72 | else: 73 | err_body = json.loads(resp.text) 74 | if "message" in err_body: 75 | # Certain errors have a message payload with a link to the github repo, which we should remove. 76 | error_msg = re.sub( 77 | r"\s*Please use https://github\.com/Snowflake-Labs/semantic-model-generator.*", 78 | "", 79 | err_body["message"], 80 | ) 81 | raise ValueError(error_msg) 82 | raise ValueError(err_body) 83 | -------------------------------------------------------------------------------- /app_utils/looker_sdk.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/app_utils/looker_sdk.zip -------------------------------------------------------------------------------- /app_utils/strictyaml.zip: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/app_utils/strictyaml.zip -------------------------------------------------------------------------------- /artifacts/customers.yml: -------------------------------------------------------------------------------- 1 | models: 2 | - name: customers 3 | description: Customer overview data mart, offering key details for each unique customer. One row per customer. 4 | data_tests: 5 | - dbt_utils.expression_is_true: 6 | expression: "lifetime_spend_pretax + lifetime_tax_paid = lifetime_spend" 7 | columns: 8 | - name: customer_id 9 | description: The unique key of the orders mart. 10 | data_tests: 11 | - not_null 12 | - unique 13 | - name: customer_name 14 | description: Customers' full name. 15 | - name: count_lifetime_orders 16 | description: Total number of orders a customer has ever placed. 17 | - name: first_ordered_at 18 | description: The timestamp when a customer placed their first order. 19 | - name: last_ordered_at 20 | description: The timestamp of a customer's most recent order. 21 | - name: lifetime_spend_pretax 22 | description: The sum of all the pre-tax subtotals of every order a customer has placed. 23 | - name: lifetime_tax_paid 24 | description: The sum of all the tax portion of every order a customer has placed. 25 | - name: lifetime_spend 26 | description: The sum of all the order totals (including tax) that a customer has ever placed. 27 | - name: customer_type 28 | description: Options are 'new' or 'returning', indicating if a customer has ordered more than once or has only placed their first order to date. 29 | data_tests: 30 | - accepted_values: 31 | values: ["new", "returning"] 32 | 33 | semantic_models: 34 | - name: customers 35 | defaults: 36 | agg_time_dimension: first_ordered_at 37 | description: | 38 | Customer grain mart. 
39 | model: ref('customers') 40 | entities: 41 | - name: customer 42 | expr: customer_id 43 | type: primary 44 | dimensions: 45 | - name: customer_name 46 | type: categorical 47 | - name: customer_type 48 | type: categorical 49 | - name: first_ordered_at 50 | type: time 51 | type_params: 52 | time_granularity: day 53 | - name: last_ordered_at 54 | type: time 55 | type_params: 56 | time_granularity: day 57 | measures: 58 | - name: count_lifetime_orders 59 | description: Total count of orders per customer. 60 | agg: sum 61 | - name: lifetime_spend_pretax 62 | description: Customer lifetime spend before taxes. 63 | agg: sum 64 | - name: lifetime_spend 65 | agg: sum 66 | description: Gross customer lifetime spend inclusive of taxes. 67 | 68 | metrics: 69 | - name: lifetime_spend_pretax 70 | description: Customer's lifetime spend before tax 71 | label: LTV Pre-tax 72 | type: simple 73 | type_params: 74 | measure: lifetime_spend_pretax 75 | - name: count_lifetime_orders 76 | description: Count of lifetime orders 77 | label: Count Lifetime Orders 78 | type: simple 79 | type_params: 80 | measure: count_lifetime_orders 81 | - name: average_order_value 82 | description: LTV pre-tax / number of orders 83 | label: Average Order Value 84 | type: derived 85 | type_params: 86 | metrics: 87 | - count_lifetime_orders 88 | - lifetime_spend_pretax 89 | expr: lifetime_spend_pretax / count_lifetime_orders 90 | 91 | saved_queries: 92 | - name: customer_order_metrics 93 | query_params: 94 | metrics: 95 | - count_lifetime_orders 96 | - lifetime_spend_pretax 97 | - average_order_value 98 | group_by: 99 | - Entity('customer') 100 | exports: 101 | - name: customer_order_metrics 102 | config: 103 | export_as: table 104 | -------------------------------------------------------------------------------- /environment.yml: -------------------------------------------------------------------------------- 1 | name: sf_env 2 | channels: 3 | - snowflake 4 | dependencies: 5 | - python=3.10.* 6 | - pandas=2.2.2 7 | - tqdm=4.66.5 8 | - streamlit=1.35.0 9 | - loguru=0.5.3 10 | - protobuf=3.20.3 11 | - pydantic=2.8.2 12 | - pyyaml=6.0.1 13 | - ruamel.yaml=0.17.21 14 | - pyarrow=14.0.2 15 | - sqlglot=25.10.0 16 | - numpy=1.26.4 17 | - python-dotenv=0.21.0 18 | - urllib3=2.2.2 19 | - requests=2.32.3 20 | - types-pyyaml=6.0.12.12 21 | - types-protobuf=4.25.0.20240417 22 | - snowflake-snowpark-python=1.18.0 23 | - streamlit-extras=0.4.0 24 | - cattrs=23.1.2 -------------------------------------------------------------------------------- /images/dbt-signature_tm_black.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/images/dbt-signature_tm_black.png -------------------------------------------------------------------------------- /images/error39.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/images/error39.png -------------------------------------------------------------------------------- /images/looker.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/images/looker.png -------------------------------------------------------------------------------- /journeys/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/journeys/__init__.py -------------------------------------------------------------------------------- /journeys/builder.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | from loguru import logger 3 | from snowflake.connector import ProgrammingError 4 | 5 | from app_utils.shared_utils import ( 6 | GeneratorAppScreen, 7 | format_snowflake_context, 8 | get_available_databases, 9 | get_available_schemas, 10 | get_available_tables, 11 | input_sample_value_num, 12 | input_semantic_file_name, 13 | run_generate_model_str_from_snowflake, 14 | ) 15 | 16 | 17 | def update_schemas_and_tables() -> None: 18 | """ 19 | Callback to run when the selected databases change. Ensures that if a database is deselected, the corresponding 20 | schemas and tables are also deselected. 21 | Returns: None 22 | 23 | """ 24 | databases = st.session_state["selected_databases"] 25 | 26 | # Fetch the available schemas for the selected databases 27 | schemas = [] 28 | for db in databases: 29 | try: 30 | schemas.extend(get_available_schemas(db)) 31 | except ProgrammingError: 32 | logger.info( 33 | f"Insufficient permissions to read from database {db}, skipping" 34 | ) 35 | 36 | st.session_state["available_schemas"] = schemas 37 | 38 | # Enforce that the previously selected schemas are still valid 39 | valid_selected_schemas = [ 40 | schema for schema in st.session_state["selected_schemas"] if schema in schemas 41 | ] 42 | st.session_state["selected_schemas"] = valid_selected_schemas 43 | update_tables() 44 | 45 | 46 | def update_tables() -> None: 47 | """ 48 | Callback to run when the selected schemas change. Ensures that if a schema is deselected, the corresponding 49 | tables are also deselected. 50 | """ 51 | schemas = st.session_state["selected_schemas"] 52 | 53 | # Fetch the available tables for the selected schemas 54 | tables = [] 55 | for schema in schemas: 56 | try: 57 | tables.extend(get_available_tables(schema)) 58 | except ProgrammingError: 59 | logger.info( 60 | f"Insufficient permissions to read from schema {schema}, skipping" 61 | ) 62 | st.session_state["available_tables"] = tables 63 | 64 | # Enforce that the previously selected tables are still valid 65 | valid_selected_tables = [ 66 | table for table in st.session_state["selected_tables"] if table in tables 67 | ] 68 | st.session_state["selected_tables"] = valid_selected_tables 69 | 70 | 71 | @st.experimental_dialog("Selecting your tables", width="large") 72 | def table_selector_dialog() -> None: 73 | st.write( 74 | "Please fill out the following fields to start building your semantic model." 
75 | ) 76 | model_name = input_semantic_file_name() 77 | sample_values = input_sample_value_num() 78 | st.markdown("") 79 | 80 | if "selected_databases" not in st.session_state: 81 | st.session_state["selected_databases"] = [] 82 | 83 | if "selected_schemas" not in st.session_state: 84 | st.session_state["selected_schemas"] = [] 85 | 86 | if "selected_tables" not in st.session_state: 87 | st.session_state["selected_tables"] = [] 88 | 89 | with st.spinner("Loading databases..."): 90 | available_databases = get_available_databases() 91 | 92 | st.multiselect( 93 | label="Databases", 94 | options=available_databases, 95 | placeholder="Select the databases that contain the tables you'd like to include in your semantic model.", 96 | on_change=update_schemas_and_tables, 97 | key="selected_databases", 98 | # default=st.session_state.get("selected_databases", []), 99 | ) 100 | 101 | st.multiselect( 102 | label="Schemas", 103 | options=st.session_state.get("available_schemas", []), 104 | placeholder="Select the schemas that contain the tables you'd like to include in your semantic model.", 105 | on_change=update_tables, 106 | key="selected_schemas", 107 | format_func=lambda x: format_snowflake_context(x, -1), 108 | ) 109 | 110 | st.multiselect( 111 | label="Tables", 112 | options=st.session_state.get("available_tables", []), 113 | placeholder="Select the tables you'd like to include in your semantic model.", 114 | key="selected_tables", 115 | format_func=lambda x: format_snowflake_context(x, -1), 116 | ) 117 | 118 | st.markdown("
", unsafe_allow_html=True) 119 | experimental_features = st.checkbox( 120 | "Enable joins (optional)", 121 | help="Checking this box will enable you to add/edit join paths in your semantic model. If enabling this setting, please ensure that you have the proper parameters set on your Snowflake account. Reach out to your account team for access.", 122 | ) 123 | 124 | st.session_state["experimental_features"] = experimental_features 125 | 126 | submit = st.button("Submit", use_container_width=True, type="primary") 127 | if submit: 128 | try: 129 | run_generate_model_str_from_snowflake( 130 | model_name, 131 | sample_values, 132 | st.session_state["selected_tables"], 133 | allow_joins=experimental_features, 134 | ) 135 | st.session_state["page"] = GeneratorAppScreen.ITERATION 136 | st.rerun() 137 | except ValueError as e: 138 | st.error(e) 139 | 140 | 141 | def show() -> None: 142 | table_selector_dialog() 143 | -------------------------------------------------------------------------------- /journeys/joins.py: -------------------------------------------------------------------------------- 1 | from typing import Optional 2 | 3 | import streamlit as st 4 | from streamlit_extras.row import row 5 | 6 | from app_utils.shared_utils import get_snowflake_connection 7 | from semantic_model_generator.data_processing.cte_utils import ( 8 | fully_qualified_table_name, 9 | ) 10 | from semantic_model_generator.protos import semantic_model_pb2 11 | from semantic_model_generator.snowflake_utils.snowflake_connector import ( 12 | get_table_primary_keys, 13 | ) 14 | 15 | SUPPORTED_JOIN_TYPES = [ 16 | join_type 17 | for join_type in semantic_model_pb2.JoinType.values() 18 | if join_type != semantic_model_pb2.JoinType.join_type_unknown 19 | ] 20 | SUPPORTED_RELATIONSHIP_TYPES = [ 21 | relationship_type 22 | for relationship_type in semantic_model_pb2.RelationshipType.values() 23 | if relationship_type 24 | != semantic_model_pb2.RelationshipType.relationship_type_unknown 25 | ] 26 | 27 | 28 | def relationship_builder( 29 | relationship: semantic_model_pb2.Relationship, key: Optional[int] = 0 30 | ) -> None: 31 | """ 32 | Renders a UI for building/editing a semantic model relationship. 33 | Args: 34 | relationship: The relationship object to edit. 35 | 36 | Returns: 37 | 38 | """ 39 | with st.expander( 40 | relationship.name or f"{relationship.left_table} ↔️ {relationship.right_table}", 41 | expanded=True, 42 | ): 43 | relationship.name = st.text_input( 44 | "Name", value=relationship.name, key=f"name_{key}" 45 | ) 46 | # Logic to preselect the tables in the dropdown based on what's in the semantic model. 
47 | try: 48 | default_left_table = [ 49 | table.name for table in st.session_state.semantic_model.tables 50 | ].index(relationship.left_table) 51 | default_right_table = [ 52 | table.name for table in st.session_state.semantic_model.tables 53 | ].index(relationship.right_table) 54 | except ValueError: 55 | default_left_table = 0 56 | default_right_table = 0 57 | relationship.left_table = st.selectbox( 58 | "Left Table", 59 | options=[table.name for table in st.session_state.semantic_model.tables], 60 | index=default_left_table, 61 | key=f"left_table_{key}", 62 | ) 63 | 64 | relationship.right_table = st.selectbox( 65 | "Right Table", 66 | options=[table.name for table in st.session_state.semantic_model.tables], 67 | index=default_right_table, 68 | key=f"right_table_{key}", 69 | ) 70 | 71 | relationship.join_type = st.radio( # type: ignore 72 | "Join Type", 73 | options=SUPPORTED_JOIN_TYPES, 74 | format_func=lambda join_type: semantic_model_pb2.JoinType.Name(join_type), 75 | index=SUPPORTED_JOIN_TYPES.index(relationship.join_type), 76 | key=f"join_type_{key}", 77 | ) 78 | 79 | relationship.relationship_type = st.radio( # type: ignore 80 | "Relationship Type", 81 | options=SUPPORTED_RELATIONSHIP_TYPES, 82 | format_func=lambda relationship_type: semantic_model_pb2.RelationshipType.Name( 83 | relationship_type 84 | ), 85 | index=SUPPORTED_RELATIONSHIP_TYPES.index(relationship.relationship_type), 86 | key=f"relationship_type_{key}", 87 | ) 88 | 89 | st.divider() 90 | # Builder section for the relationship's columns. 91 | for col_idx, join_cols in enumerate(relationship.relationship_columns): 92 | # Grabbing references to the exact Table objects that the relationship is pointing to. 93 | # This allows us to pull the columns. 94 | left_table_object = next( 95 | ( 96 | table 97 | for table in st.session_state.semantic_model.tables 98 | if table.name == relationship.left_table 99 | ) 100 | ) 101 | right_table_object = next( 102 | ( 103 | table 104 | for table in st.session_state.semantic_model.tables 105 | if table.name == relationship.right_table 106 | ) 107 | ) 108 | 109 | try: 110 | left_columns = [] 111 | left_columns.extend(left_table_object.columns) 112 | left_columns.extend(left_table_object.dimensions) 113 | left_columns.extend(left_table_object.time_dimensions) 114 | left_columns.extend(left_table_object.measures) 115 | 116 | right_columns = [] 117 | right_columns.extend(right_table_object.columns) 118 | right_columns.extend(right_table_object.dimensions) 119 | right_columns.extend(right_table_object.time_dimensions) 120 | right_columns.extend(right_table_object.measures) 121 | 122 | default_left_col = [col.name for col in left_columns].index( 123 | join_cols.left_column 124 | ) 125 | default_right_col = [col.name for col in right_columns].index( 126 | join_cols.right_column 127 | ) 128 | except ValueError: 129 | default_left_col = 0 130 | default_right_col = 0 131 | 132 | join_cols.left_column = st.selectbox( 133 | "Left Column", 134 | options=[col.name for col in left_columns], 135 | index=default_left_col, 136 | key=f"left_col_{key}_{col_idx}", 137 | ) 138 | join_cols.right_column = st.selectbox( 139 | "Right Column", 140 | options=[col.name for col in right_columns], 141 | index=default_right_col, 142 | key=f"right_col_{key}_{col_idx}", 143 | ) 144 | 145 | if st.button("Delete join key", key=f"delete_join_key_{key}_{col_idx}"): 146 | relationship.relationship_columns.pop(col_idx) 147 | st.rerun() 148 | 149 | st.divider() 150 | 151 | join_editor_row = row(2, vertical_align="center") 
152 | if join_editor_row.button( 153 | "Add new join key", 154 | key=f"add_join_keys_{key}", 155 | use_container_width=True, 156 | type="primary", 157 | ): 158 | relationship.relationship_columns.append( 159 | semantic_model_pb2.RelationKey( 160 | left_column="", 161 | right_column="", 162 | ) 163 | ) 164 | st.rerun() 165 | 166 | if join_editor_row.button( 167 | "🗑️ Delete join path", 168 | key=f"delete_join_path_{key}", 169 | use_container_width=True, 170 | ): 171 | st.session_state.builder_joins.pop(key) 172 | st.rerun() 173 | 174 | 175 | @st.experimental_dialog("Join Builder", width="large") 176 | def joins_dialog() -> None: 177 | if "builder_joins" not in st.session_state: 178 | # Making a copy of the original relationships list so we can modify freely without affecting the original. 179 | st.session_state.builder_joins = st.session_state.semantic_model.relationships[ 180 | : 181 | ] 182 | 183 | for idx, relationship in enumerate(st.session_state.builder_joins): 184 | relationship_builder(relationship, idx) 185 | 186 | # If the user clicks "Add join", add a new join to the relationships list 187 | if st.button("Add new join path", use_container_width=True): 188 | st.session_state.builder_joins.append( 189 | semantic_model_pb2.Relationship( 190 | left_table="", 191 | right_table="", 192 | join_type=semantic_model_pb2.JoinType.inner, 193 | relationship_type=semantic_model_pb2.RelationshipType.one_to_one, 194 | relationship_columns=[], 195 | ) 196 | ) 197 | st.rerun() 198 | 199 | # If the user clicks "Save", save the relationships list to the session state 200 | if st.button("Save to semantic model", use_container_width=True, type="primary"): 201 | # Quickly validate that all of the user's joins have the required fields. 202 | for relationship in st.session_state.builder_joins: 203 | if not relationship.left_table or not relationship.right_table: 204 | st.error("Please fill out left and right tables for all join paths.") 205 | return 206 | 207 | if not relationship.name: 208 | st.error( 209 | f"The join path between {relationship.left_table} and {relationship.right_table} is missing a name." 210 | ) 211 | return 212 | 213 | if not relationship.relationship_columns: 214 | st.error( 215 | f"The join path between {relationship.left_table} and {relationship.right_table} is missing joinable columns." 216 | ) 217 | return 218 | 219 | # Populate primary key information for each table in a join relationship. 
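(The primary-key lookup described by the comment above follows below.) For reference, the join paths being saved here are `semantic_model_pb2.Relationship` messages built from exactly the fields this dialog edits. A minimal sketch of constructing one outside the UI, using only fields and enum values that appear in this file; the table and column names are invented:

```python
from semantic_model_generator.protos import semantic_model_pb2

# Hypothetical join path between two logical tables of a semantic model.
relationship = semantic_model_pb2.Relationship(
    name="orders_to_customers",
    left_table="orders",
    right_table="customers",
    join_type=semantic_model_pb2.JoinType.inner,
    relationship_type=semantic_model_pb2.RelationshipType.one_to_one,
    relationship_columns=[
        semantic_model_pb2.RelationKey(
            left_column="CUSTOMER_ID",
            right_column="CUSTOMER_ID",
        )
    ],
)
print(relationship)
```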
220 | left_table_object = next( 221 | ( 222 | table 223 | for table in st.session_state.semantic_model.tables 224 | if table.name == relationship.left_table 225 | ) 226 | ) 227 | right_table_object = next( 228 | ( 229 | table 230 | for table in st.session_state.semantic_model.tables 231 | if table.name == relationship.right_table 232 | ) 233 | ) 234 | 235 | with st.spinner("Fetching primary keys..."): 236 | if not left_table_object.primary_key.columns: 237 | primary_keys = get_table_primary_keys( 238 | get_snowflake_connection(), 239 | table_fqn=fully_qualified_table_name( 240 | left_table_object.base_table 241 | ), 242 | ) 243 | left_table_object.primary_key.columns.extend(primary_keys or [""]) 244 | 245 | if not right_table_object.primary_key.columns: 246 | primary_keys = get_table_primary_keys( 247 | get_snowflake_connection(), 248 | table_fqn=fully_qualified_table_name( 249 | right_table_object.base_table 250 | ), 251 | ) 252 | right_table_object.primary_key.columns.extend(primary_keys or [""]) 253 | 254 | del st.session_state.semantic_model.relationships[:] 255 | st.session_state.semantic_model.relationships.extend( 256 | st.session_state.builder_joins 257 | ) 258 | st.session_state.validated = None 259 | st.session_state["join_dialog_open"] = False 260 | st.rerun() 261 | -------------------------------------------------------------------------------- /journeys/partner.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | 3 | 4 | @st.experimental_dialog("Partner Semantic Support", width="large") 5 | def partner_semantic_setup() -> None: 6 | """ 7 | Renders the partner semantic setup dialog with instructions. 8 | """ 9 | from partner.partner_utils import configure_partner_semantic 10 | 11 | st.write( 12 | """ 13 | Have an existing semantic layer in a partner tool that's integrated with Snowflake? 14 | See the below instructions for integrating your partner semantic specs into Cortex Analyst's semantic file. 15 | """ 16 | ) 17 | configure_partner_semantic() 18 | 19 | 20 | def show() -> None: 21 | """ 22 | Runs partner setup dialog. 
23 | """ 24 | partner_semantic_setup() 25 | -------------------------------------------------------------------------------- /mypy.ini: -------------------------------------------------------------------------------- 1 | [mypy] 2 | plugins = pydantic.mypy 3 | 4 | ignore_missing_imports = True 5 | strict = True 6 | disallow_untyped_defs = True 7 | warn_unused_ignores = False 8 | disallow_any_generics = True 9 | 10 | exclude = .venv|venv|pyvenv|(_test\.py|test_.*\.py)|_pb2\.py|_pb2\.pyi|admin_app/streamlit_app.py 11 | 12 | [mypy-semantic_model_generator.protos.semantic_model_pb2] 13 | ignore_errors = True 14 | 15 | [mypy-requests] 16 | ignore_missing_imports = True 17 | -------------------------------------------------------------------------------- /partner/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/partner/__init__.py -------------------------------------------------------------------------------- /partner/cortex.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional 2 | 3 | import pandas as pd 4 | import streamlit as st 5 | 6 | from semantic_model_generator.data_processing.proto_utils import ( 7 | proto_to_dict, 8 | yaml_to_semantic_model, 9 | ) 10 | 11 | 12 | class CortexDimension: 13 | """ 14 | Class for Cortex dimension-type field. 15 | """ 16 | 17 | def __init__(self, data: dict[str, Any]): 18 | 19 | self.data: dict[str, Any] = data 20 | self.name: str = data["name"] 21 | self.synonyms: Optional[list[str]] = data.get("synonyms", None) 22 | self.data_type: str = data.get("data_type", "TEXT") 23 | self.expr: str = data["expr"] 24 | self.description: Optional[str] = data.get("description", None) 25 | self.sample_values: Optional[list[str]] = data.get("sample_values", None) 26 | self.unique: bool = data.get("unique", False) 27 | 28 | def get_name(self) -> str: 29 | return self.name 30 | 31 | def get_data(self) -> dict[str, Any]: 32 | return self.data 33 | 34 | def get_cortex_type(self) -> str: 35 | return self.data_type 36 | 37 | def get_description(self) -> Optional[str]: 38 | return self.description 39 | 40 | def set_description(self, value: str) -> None: 41 | self.description = value 42 | 43 | def get_cortex_section(self) -> str: 44 | return "dimensions" 45 | 46 | def get_key(self) -> str: 47 | return self.expr.upper() 48 | 49 | def get_cortex_details(self) -> dict[str, Any]: 50 | """ 51 | Used in static methods in partner classes to retrieve and modify Cortex-equivalent details 52 | """ 53 | return self.data 54 | 55 | def get_cortex_comparison_dict(self) -> dict[str, Any]: 56 | return { 57 | "field_key": self.get_key(), 58 | "section": self.get_cortex_section(), 59 | "field_details": self.get_cortex_details(), 60 | } 61 | 62 | 63 | class CortexTimeDimension(CortexDimension): 64 | """ 65 | Class for Cortex time dimension-type field. 66 | """ 67 | 68 | def get_cortex_section(self) -> str: 69 | return "time_dimensions" 70 | 71 | 72 | class CortexMeasure(CortexDimension): 73 | """ 74 | Class for Cortex measure-type field. 75 | """ 76 | 77 | def __init__(self, data: dict[str, Any]): 78 | super().__init__(data) 79 | self.default_aggregation = data.get("default_aggregation", None) 80 | 81 | def get_cortex_section(self) -> str: 82 | return "measures" 83 | 84 | 85 | class CortexSemanticTable: 86 | """ 87 | Class for single Cortex logical table in semantic file. 
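(The `CortexSemanticTable` definition continues below.) The field wrapper classes defined just above are small enough to exercise on their own; here is a sketch that wraps a single dimension entry, with illustrative values borrowed from the `customer_type` column in `artifacts/customers.yml`:

```python
from partner.cortex import CortexDimension

# A dimension entry as it might appear in a generated semantic model (illustrative values).
dimension = CortexDimension(
    {
        "name": "customer_type",
        "expr": "customer_type",
        "data_type": "TEXT",
        "description": "Options are 'new' or 'returning'.",
        "sample_values": ["new", "returning"],
    }
)

# The comparison dict is what the partner-merge flow uses to line fields up
# against a partner tool's semantic layer.
print(dimension.get_cortex_comparison_dict())
# -> {'field_key': 'CUSTOMER_TYPE', 'section': 'dimensions', 'field_details': {...}}
```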
88 | """ 89 | 90 | def __init__(self, data: dict[str, Any]): 91 | self.data: dict[str, Any] = data 92 | self.name: str = data["name"] 93 | self.description: Optional[str] = data["description"] 94 | self.base_table_db: str = data["base_table"]["database"] 95 | self.base_table_schema: str = data["base_table"]["schema"] 96 | self.base_table_table: str = data["base_table"]["table"] 97 | self.dimensions: Optional[list[dict[str, Any]]] = data["dimensions"] 98 | self.time_dimensions: Optional[list[dict[str, Any]]] = data["time_dimensions"] 99 | self.measures: Optional[list[dict[str, Any]]] = data["measures"] 100 | 101 | def get_data(self) -> dict[str, Any]: 102 | return self.data 103 | 104 | def get_name(self) -> str: 105 | return self.name 106 | 107 | def get_description(self) -> Optional[str]: 108 | return self.description 109 | 110 | def get_cortex_fields(self) -> list[dict[str, Any]]: 111 | """ 112 | Processes and returns raw field data as vendor-specific field objects. 113 | """ 114 | 115 | cortex_fields = [] 116 | if self.dimensions: 117 | for dimension in self.dimensions: 118 | cortex_fields.append( 119 | CortexDimension(dimension).get_cortex_comparison_dict() 120 | ) 121 | if self.time_dimensions: 122 | for time_dimension in self.time_dimensions: 123 | cortex_fields.append( 124 | CortexTimeDimension(time_dimension).get_cortex_comparison_dict() 125 | ) 126 | if self.measures: 127 | for measure in self.measures: 128 | cortex_fields.append( 129 | CortexMeasure(measure).get_cortex_comparison_dict() 130 | ) 131 | 132 | return cortex_fields 133 | 134 | def create_comparison_df(self) -> pd.DataFrame: 135 | cortex_fields = self.get_cortex_fields() 136 | return pd.DataFrame(cortex_fields) 137 | 138 | @staticmethod 139 | def create_cortex_table_list() -> None: 140 | cortex_semantic = proto_to_dict( 141 | yaml_to_semantic_model(st.session_state["last_saved_yaml"]) 142 | ) 143 | # Need to replace table details in current entire yaml 144 | st.session_state["current_yaml_as_dict"] = cortex_semantic 145 | tables = [] 146 | for table in cortex_semantic["tables"]: 147 | tables.append(CortexSemanticTable(table)) 148 | st.session_state["cortex_comparison_tables"] = tables 149 | 150 | @staticmethod 151 | def retrieve_df_by_name(name: str) -> pd.DataFrame: 152 | for table in st.session_state["cortex_comparison_tables"]: 153 | if table.get_name() == name: 154 | return table.create_comparison_df() 155 | -------------------------------------------------------------------------------- /partner/dbt.py: -------------------------------------------------------------------------------- 1 | from typing import Any, Optional, Union 2 | 3 | import pandas as pd 4 | import streamlit as st 5 | import yaml 6 | from snowflake.connector import ProgrammingError 7 | 8 | from app_utils.shared_utils import ( 9 | download_yaml, 10 | get_snowflake_connection, 11 | get_yamls_from_stage, 12 | set_sit_query_tag, 13 | stage_selector_container, 14 | ) 15 | 16 | # Partner semantic support instructions 17 | DBT_IMAGE = "images/dbt-signature_tm_black.png" 18 | DBT_MODEL_INSTRUCTIONS = """ 19 | ### [SQL Model](https://docs.getdbt.com/docs/build/sql-models) 20 | 21 | Materialize your SQL model(s) as Snowflake table(s) and generate a Cortex Analyst semantic file for them directly. 22 | > Steps: 23 | > 1) Update dbt model(s) to be [materialized](https://docs.getdbt.com/docs/build/materializations) in Snowflake. 
24 | > 2) Update dbt model(s) to [persist docs](https://docs.getdbt.com/reference/resource-configs/persist_docs) to capture table/column descriptions. 25 | > 3) Run dbt model(s) to materialize in Snowflake. 26 | > 4) Select **🛠 Create a new semantic model** on the homepage and select the materialized Snowflake table(s). 27 | """ 28 | DBT_SEMANTIC_INSTRUCTIONS = """ 29 | ### [Semantic Model](https://docs.getdbt.com/docs/build/semantic-models) 30 | 31 | We extract metadata from your dbt semantic yaml file(s) and merge it with a generated Cortex Analyst semantic file. 32 | 33 | **Note**: The DBT semantic layer must be sourced from tables/views in Snowflake. 34 | If using Streamlit in Snowflake, upload dbt semantic (yaml/yml) file(s) to Snowflake stage first. 35 | 36 | > Steps: 37 | > 1) Select your dbt semantic (yaml/yml) file(s) below from stage or upload directly if not using Streamlit in Snowflake. 38 | > 2) Select **🛠 Create a new semantic model** to generate a new Cortex Analyst semantic file for Snowflake tables or **✏️ Edit an existing semantic model**. 39 | > 3) Validate the output in the UI. 40 | > 4) Once you've validated the semantic file, click **Partner Semantic** to merge DBT and Cortex Analyst semantic files. 41 | """ 42 | 43 | 44 | def upload_dbt_semantic() -> None: 45 | """ 46 | Upload semantic file(s) for dbt from local source. 47 | 48 | Returns: None 49 | """ 50 | uploaded_files = [] 51 | if st.session_state["sis"]: 52 | stage_selector_container() 53 | # Based on the currently selected stage, show a dropdown of YAML files for the user to pick from. 54 | available_files = [] 55 | if ( 56 | "selected_iteration_stage" in st.session_state 57 | and st.session_state["selected_iteration_stage"] 58 | ): 59 | try: 60 | available_files = get_yamls_from_stage( 61 | st.session_state["selected_iteration_stage"], 62 | include_yml=True, 63 | ) 64 | except (ValueError, ProgrammingError): 65 | st.error("Insufficient permissions to read from the selected stage.") 66 | st.stop() 67 | 68 | stage_files = st.multiselect("Staged files", options=available_files) 69 | if stage_files: 70 | for staged_file in stage_files: 71 | file_content = download_yaml( 72 | staged_file, st.session_state["selected_iteration_stage"] 73 | ) 74 | uploaded_files.append(file_content) 75 | else: 76 | uploaded_files = st.file_uploader( # type: ignore 77 | f'Upload {st.session_state["partner_tool"]} semantic yaml file(s)', 78 | type=["yaml", "yml"], 79 | accept_multiple_files=True, 80 | key="dbt_files", 81 | ) 82 | if uploaded_files: 83 | partner_semantic: list[Union[None, DBTSemanticModel]] = [] 84 | for file in uploaded_files: 85 | partner_semantic.extend(read_dbt_yaml(file)) # type: ignore 86 | 87 | if not partner_semantic: 88 | st.error("Upload file(s) do not contain required semantic_models section.") 89 | else: 90 | st.session_state["partner_semantic"] = partner_semantic 91 | if st.button("Continue", type="primary"): 92 | st.session_state["partner_setup"] = True 93 | set_sit_query_tag( 94 | get_snowflake_connection(), 95 | vendor="dbt", 96 | action="setup_complete", 97 | ) 98 | st.rerun() 99 | else: 100 | st.session_state["partner_semantic"] = None 101 | 102 | 103 | class DBTEntity: 104 | """ 105 | Class for dbt entity-type field. 
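    Illustrative example: an entity such as
        {"name": "customer_id", "type": "primary", "expr": "customer_id", "description": "..."}
    is exposed to Cortex as a dimension with data_type TEXT.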
106 | """ 107 | 108 | def __init__(self, entity: dict[str, Any]): 109 | 110 | self.entity: dict[str, Any] = entity 111 | self.name: str = entity["name"] 112 | self.type: str = entity.get("type", None) 113 | self.expr: str = entity.get("expr", self.name) 114 | self.description: Optional[str] = entity.get("description", None) 115 | self.cortex_map = { 116 | "name": self.name, 117 | "description": self.description, 118 | "expr": self.expr, 119 | "data_type": self.get_cortex_type(), 120 | } 121 | 122 | def get_data(self) -> dict[str, Any]: 123 | return self.entity 124 | 125 | def get_cortex_type(self) -> str: 126 | return "TEXT" 127 | 128 | def get_cortex_section(self) -> str: 129 | return "dimensions" 130 | 131 | def get_key(self) -> str: 132 | return self.expr.upper() 133 | 134 | def get_cortex_details(self) -> dict[str, Any]: 135 | return_details = {} 136 | for k, v in self.cortex_map.items(): 137 | if v is not None: 138 | return_details[k] = v 139 | return return_details 140 | 141 | def get_cortex_comparison_dict(self) -> dict[str, Any]: 142 | return { 143 | "field_key": self.get_key(), 144 | "section": self.get_cortex_section(), 145 | "field_details": self.get_cortex_details(), 146 | } 147 | 148 | 149 | class DBTMeasure(DBTEntity): 150 | """ 151 | Class for dbt measure-type field. 152 | """ 153 | 154 | def __init__(self, entity: dict[str, Any]): 155 | super().__init__(entity) 156 | self.agg: Optional[str] = entity.get("agg", None) 157 | self.cortex_map = { 158 | "name": self.name, 159 | "description": self.description, 160 | "expr": self.expr, 161 | "data_type": self.get_cortex_type(), 162 | "default_aggregation": self.agg, 163 | } 164 | 165 | def get_cortex_type(self) -> str: 166 | return "NUMBER" 167 | 168 | def get_cortex_section(self) -> str: 169 | return "measures" 170 | 171 | 172 | class DBTDimension(DBTEntity): 173 | """ 174 | Class for dbt dimension-type field. 175 | """ 176 | 177 | def get_cortex_type(self) -> str: 178 | if self.type == "time": 179 | return "DATETIME" 180 | else: 181 | return "TEXT" 182 | 183 | def get_cortex_section(self) -> str: 184 | if self.type == "time": 185 | return "time_dimensions" 186 | else: 187 | return "dimensions" 188 | 189 | 190 | class DBTSemanticModel: 191 | """ 192 | Class for single DBT semantic model. 
193 | """ 194 | 195 | def __init__(self, data: dict[str, Any]): 196 | self.data: dict[str, Any] = data 197 | self.name: str = data["name"] 198 | self.description: Optional[str] = data.get("description", None) 199 | self.entities: Optional[list[dict[str, Any]]] = data["entities"] 200 | self.dimensions: Optional[list[dict[str, Any]]] = data["dimensions"] 201 | self.measures: Optional[list[dict[str, Any]]] = data["measures"] 202 | 203 | def get_data(self) -> dict[str, Any]: 204 | return self.data 205 | 206 | def get_name(self) -> str: 207 | return self.name 208 | 209 | def get_description(self) -> Optional[str]: 210 | return self.description 211 | 212 | def get_cortex_fields(self) -> list[dict[str, Any]]: 213 | cortex_fields = [] 214 | if self.entities: 215 | for entity in self.entities: 216 | cortex_fields.append(DBTEntity(entity).get_cortex_comparison_dict()) 217 | if self.measures: 218 | for measure in self.measures: 219 | cortex_fields.append(DBTMeasure(measure).get_cortex_comparison_dict()) 220 | if self.dimensions: 221 | for dimension in self.dimensions: 222 | cortex_fields.append( 223 | DBTDimension(dimension).get_cortex_comparison_dict() 224 | ) 225 | 226 | return cortex_fields 227 | 228 | def create_comparison_df(self) -> pd.DataFrame: 229 | cortex_fields = self.get_cortex_fields() 230 | return pd.DataFrame(cortex_fields) 231 | 232 | @staticmethod 233 | def retrieve_df_by_name(name: str) -> pd.DataFrame: 234 | for model in st.session_state["partner_semantic"]: 235 | if model.get_name() == name: 236 | return model.create_comparison_df() 237 | 238 | 239 | def read_dbt_yaml(file_path: str) -> list[DBTSemanticModel]: 240 | """ 241 | Reads file uploads and extracts dbt semantic files in list. 242 | Args: 243 | file_path (str): Local file path uploaded by user. 244 | 245 | Returns: None | list[DBTSemanticModel] 246 | """ 247 | 248 | data = yaml.safe_load(file_path) 249 | dbt_semantic_models = [] 250 | if "semantic_models" in data: 251 | # dbt_semantic_models = [] 252 | for semantic_model in data["semantic_models"]: 253 | dbt_semantic_models.append(DBTSemanticModel(semantic_model)) 254 | else: 255 | st.warning(f"{file_path} does not contain semantic_models section. Skipping.") 256 | return dbt_semantic_models 257 | -------------------------------------------------------------------------------- /partner/partner_utils.py: -------------------------------------------------------------------------------- 1 | import json 2 | import time 3 | from enum import Enum 4 | from typing import Any, Union 5 | 6 | import numpy as np 7 | import pandas as pd 8 | import streamlit as st 9 | import yaml 10 | 11 | from app_utils.shared_utils import ( 12 | get_snowflake_connection, 13 | render_image, 14 | set_sit_query_tag, 15 | ) 16 | from partner.cortex import CortexSemanticTable 17 | from partner.dbt import DBTSemanticModel, upload_dbt_semantic 18 | from semantic_model_generator.data_processing.proto_utils import yaml_to_semantic_model 19 | 20 | 21 | class PartnerTool(Enum): 22 | DBT_SQL_MODEL = "dbt - SQL Model" 23 | DBT_SEMANTIC_MODEL = "dbt - Semantic Model" 24 | LOOKER_EXPLORE = "Looker - Explore" 25 | 26 | 27 | def set_partner_instructions() -> None: 28 | """ 29 | Sets instructions and partner logo in session_state based on selected partner. 
30 | Returns: None 31 | """ 32 | 33 | if st.session_state.get("partner_tool", None): 34 | if st.session_state["partner_tool"] == PartnerTool.DBT_SQL_MODEL.value: 35 | from partner.dbt import DBT_IMAGE, DBT_MODEL_INSTRUCTIONS 36 | 37 | instructions = DBT_MODEL_INSTRUCTIONS 38 | image = DBT_IMAGE 39 | image_size = (72, 32) 40 | elif st.session_state["partner_tool"] == PartnerTool.DBT_SEMANTIC_MODEL.value: 41 | from partner.dbt import DBT_IMAGE, DBT_SEMANTIC_INSTRUCTIONS 42 | 43 | instructions = DBT_SEMANTIC_INSTRUCTIONS 44 | image = DBT_IMAGE 45 | image_size = (72, 32) 46 | elif st.session_state["partner_tool"] == PartnerTool.LOOKER_EXPLORE.value: 47 | from partner.looker import LOOKER_IMAGE, LOOKER_INSTRUCTIONS 48 | 49 | instructions = LOOKER_INSTRUCTIONS 50 | image = LOOKER_IMAGE 51 | image_size = (72, 72) 52 | st.session_state["partner_instructions"] = instructions 53 | st.session_state["partner_image"] = image 54 | st.session_state["partner_image_size"] = image_size 55 | 56 | 57 | def configure_partner_semantic() -> None: 58 | """ 59 | Upload semantic files from local source. 60 | Returns: None 61 | """ 62 | 63 | partners = [tool.value for tool in PartnerTool] 64 | 65 | st.selectbox( 66 | "Select the partner tool", 67 | partners, 68 | index=None, 69 | key="partner_tool", 70 | on_change=set_partner_instructions(), # type: ignore 71 | ) 72 | if st.session_state.get("partner_tool", None): 73 | with st.expander( 74 | "Instructions", 75 | expanded=True, 76 | ): 77 | render_image( 78 | st.session_state["partner_image"], 79 | st.session_state["partner_image_size"], 80 | ) 81 | st.write(st.session_state["partner_instructions"]) 82 | 83 | # Previous dialog box widget values will reset when overlayed 84 | if st.session_state.get("partner_tool", None): 85 | st.session_state["selected_partner"] = st.session_state["partner_tool"] 86 | 87 | if st.session_state["partner_tool"] == PartnerTool.DBT_SEMANTIC_MODEL.value: 88 | upload_dbt_semantic() 89 | if st.session_state["partner_tool"] == PartnerTool.LOOKER_EXPLORE.value: 90 | from partner.looker import set_looker_semantic 91 | 92 | set_looker_semantic() 93 | if st.session_state["partner_tool"] == PartnerTool.DBT_SQL_MODEL.value: 94 | st.session_state["partner_setup"] = False 95 | 96 | 97 | class PartnerCompareRow: 98 | """ 99 | Renders matched and unmatched cortex and partner fields for comparison. 
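    Illustrative note: `row_data` is one row of the merged comparison dataframe and is expected
    to carry `field_key`, `field_details_cortex`, and `field_details_partner`.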
100 | """ 101 | 102 | def __init__(self, row_data: pd.Series) -> None: # type: ignore 103 | self.row_data = row_data 104 | self.key = row_data["field_key"] 105 | self.cortex_metadata = ( 106 | self.row_data["field_details_cortex"] 107 | if self.row_data["field_details_cortex"] 108 | else {} 109 | ) 110 | self.partner_metadata = ( 111 | self.row_data["field_details_partner"] 112 | if self.row_data["field_details_partner"] 113 | else {} 114 | ) 115 | 116 | def render_row(self) -> Union[None, dict[str, Any]]: # type: ignore 117 | toggle_options = ["merged", "cortex", "partner", "remove"] 118 | metadata = {} 119 | 120 | # Create metadata based for each field given merging or singular semantic file useage of the field 121 | # Merge will merge the 2 based on user-selected preference 122 | if self.cortex_metadata and self.partner_metadata: 123 | metadata["merged"] = self.cortex_metadata.copy() 124 | if st.session_state["partner_metadata_preference"] == "Partner": 125 | metadata["merged"] = { 126 | k: v for k, v in self.cortex_metadata.items() if v 127 | } | {k: v for k, v in self.partner_metadata.items() if v} 128 | else: 129 | metadata["merged"] = { 130 | k: v for k, v in self.partner_metadata.items() if v 131 | } | {k: v for k, v in self.cortex_metadata.items() if v} 132 | 133 | else: 134 | metadata["merged"] = {} 135 | metadata["partner"] = self.partner_metadata if self.partner_metadata else {} 136 | metadata["cortex"] = self.cortex_metadata if self.cortex_metadata else {} 137 | metadata["remove"] = {} 138 | 139 | if metadata["merged"]: 140 | toggle_default = "merged" 141 | elif metadata["partner"]: 142 | if st.session_state["keep_extra_partner"]: 143 | toggle_default = "partner" 144 | else: 145 | toggle_default = "remove" 146 | elif metadata["cortex"]: 147 | if st.session_state["keep_extra_cortex"]: 148 | toggle_default = "cortex" 149 | else: 150 | toggle_default = "remove" 151 | else: 152 | toggle_default = "remove" 153 | 154 | key_col, detail_col = st.columns((0.5, 1)) 155 | with key_col: 156 | st.write(self.key) 157 | # We want to disable non-options but always keep remove option 158 | revised_options = [ 159 | i for i in toggle_options if metadata[i] or i == "remove" 160 | ] 161 | detail_selection: str = st.radio( 162 | "Keep", # type: ignore 163 | index=revised_options.index(toggle_default), 164 | options=revised_options, 165 | key=f"row_{self.key}", 166 | format_func=lambda x: x.capitalize(), 167 | label_visibility="collapsed", 168 | ) 169 | with detail_col: 170 | if metadata[detail_selection]: 171 | # Only printing string valued keys for now 172 | st.json( 173 | { 174 | k: v 175 | for k, v in metadata[detail_selection].items() 176 | if isinstance(v, str) 177 | } 178 | ) 179 | else: 180 | st.write("NA") 181 | st.divider() 182 | # Extract the selected metadata if not set to remove 183 | if detail_selection != "remove": 184 | selected_metadata: dict[str, Any] = metadata[detail_selection] 185 | # Add expr to selected metadata if it's not included which is the case for dbt 186 | selected_metadata["expr"] = self.key 187 | return selected_metadata 188 | 189 | 190 | def compare_sections(section_cortex: str, section_partner: str) -> str: 191 | """ 192 | Compares section_cortex and section_parnter returning the former if available. 193 | Otherwise, returns the latter. 194 | 195 | Args: 196 | section_cortex (str): The Cortex section of the Cortex field if found. 197 | section_cortex (str): The Cortex section of the Partner field if found. 198 | 199 | Returns: 200 | str: Cortex section name. 
201 | """ 202 | 203 | if section_cortex: 204 | return section_cortex 205 | else: 206 | return section_partner 207 | 208 | 209 | def compare_data_types( 210 | details_cortex: dict[str, Any], details_partner: dict[str, Any] 211 | ) -> Any: 212 | """ 213 | Returns intended cortex datatype comparing cortex and partner datatype values. 214 | 215 | Args: 216 | details_cortex (dict[str, Any]): Dictionary of Cortex field metadata. 217 | details_partner (dict[str, Any]): Dictionary of Parnter's Cortex field metadata. 218 | 219 | Returns: 220 | str: Cortex data_type. 221 | """ 222 | 223 | cortex_data_type = None 224 | partner_data_type = None 225 | 226 | if isinstance(details_cortex, dict): 227 | cortex_data_type = details_cortex.get("data_type", None) 228 | if isinstance(details_partner, dict): 229 | partner_data_type = details_partner.get("data_type", None) 230 | 231 | if cortex_data_type: 232 | return cortex_data_type 233 | elif partner_data_type: 234 | return partner_data_type 235 | else: 236 | return "TEXT" 237 | 238 | 239 | @st.experimental_dialog("Integrate partner tool semantic specs", width="large") 240 | def integrate_partner_semantics() -> None: 241 | """ 242 | Runs UI module for comparing Cortex and Partner fields for integration. 243 | 244 | Returns: 245 | None 246 | """ 247 | 248 | st.write( 249 | "Specify how to merge semantic metadata from your selected partner tool with Cortex Analyst's semantic model." 250 | ) 251 | 252 | st.write(f"Partner: **{st.session_state.get('selected_partner', None)}**") 253 | 254 | COMPARE_SEMANTICS_HELP = """Which semantic file should be checked first for necessary metadata. 255 | Where metadata is missing, the other semantic file will be checked.""" 256 | 257 | INTEGRATE_HELP = ( 258 | """Merge the selected Snowflake and Partner tables' semantics together.""" 259 | ) 260 | 261 | SAVE_HELP = """Save the merges to the Cortex Analyst semantic model for validation and iteration.""" 262 | 263 | KEEP_CORTEX_HELP = """Retain fields that are found in Cortex Analyst semantic model 264 | but not in Partner semantic model.""" 265 | 266 | KEEP_PARTNER_HELP = """Retain fields that are found in Partner semantic model 267 | but not in Cortex Analyst semantic model.""" 268 | 269 | if st.session_state.get("partner_setup", False): 270 | # Execute pre-processing behind the scenes based on vendor tool 271 | CortexSemanticTable.create_cortex_table_list() 272 | 273 | if ( 274 | st.session_state.get("selected_partner", None) 275 | == PartnerTool.LOOKER_EXPLORE.value 276 | ): 277 | from partner.looker import LookerSemanticTable 278 | 279 | LookerSemanticTable.create_cortex_table_list() 280 | elif ( 281 | st.session_state.get("selected_partner", None) 282 | == PartnerTool.DBT_SEMANTIC_MODEL.value 283 | ): 284 | pass 285 | else: 286 | st.error("Selected partner tool not available.") 287 | 288 | # Create table selections for comparison 289 | partner_tables = [ 290 | model.get_name() for model in st.session_state["partner_semantic"] 291 | ] 292 | cortex_tables = [ 293 | table.get_name() for table in st.session_state["cortex_comparison_tables"] 294 | ] 295 | 296 | st.write("Select which logical tables/views to compare and merge.") 297 | c1, c2 = st.columns(2) 298 | with c1: 299 | semantic_cortex_tbl: str = st.selectbox("Snowflake", cortex_tables) # type: ignore 300 | with c2: 301 | semantic_partner_tbl: str = st.selectbox("Partner", partner_tables) # type: ignore 302 | 303 | st.session_state["partner_metadata_preference"] = st.selectbox( 304 | "For fields shared in both sources, 
which source should be checked first for common metadata?", 305 | ["Partner", "Cortex"], 306 | index=0, 307 | help=COMPARE_SEMANTICS_HELP, 308 | ) 309 | orphan_label, orphan_col1, orphan_col2 = st.columns(3, gap="small") 310 | with orphan_label: 311 | st.write("Retain unmatched fields:") 312 | with orphan_col1: 313 | st.session_state["keep_extra_cortex"] = st.toggle( 314 | "Cortex", value=True, help=KEEP_CORTEX_HELP 315 | ) 316 | with orphan_col2: 317 | st.session_state["keep_extra_partner"] = st.toggle( 318 | "Partner", value=True, help=KEEP_PARTNER_HELP 319 | ) 320 | with st.expander("Advanced configuration", expanded=False): 321 | # Create dataframe of each semantic file's fields with mergeable keys 322 | st.caption("Only shared metadata information displayed") 323 | cortex_fields_df = CortexSemanticTable.retrieve_df_by_name( 324 | semantic_cortex_tbl 325 | ) 326 | 327 | if ( 328 | st.session_state.get("selected_partner", None) 329 | == PartnerTool.LOOKER_EXPLORE.value 330 | ): 331 | from partner.looker import LookerSemanticTable 332 | 333 | partner_fields_df = LookerSemanticTable.retrieve_df_by_name( 334 | semantic_partner_tbl 335 | ) 336 | if ( 337 | st.session_state.get("selected_partner", None) 338 | == PartnerTool.DBT_SEMANTIC_MODEL.value 339 | ): 340 | partner_fields_df = DBTSemanticModel.retrieve_df_by_name( 341 | semantic_partner_tbl 342 | ) 343 | 344 | combined_fields_df = cortex_fields_df.merge( 345 | partner_fields_df, 346 | on="field_key", 347 | how="outer", 348 | suffixes=("_cortex", "_partner"), 349 | ).replace( 350 | np.nan, None 351 | ) # Will be comparing values to None in UI logic 352 | 353 | # Convert json strings to dict for easier extraction later 354 | for col in ["field_details_cortex", "field_details_partner"]: 355 | combined_fields_df[col] = combined_fields_df[col].apply( 356 | lambda x: ( 357 | json.loads(x) 358 | if not pd.isnull(x) and not isinstance(x, dict) 359 | else x 360 | ) 361 | ) 362 | 363 | # Create containers and store them in a dictionary 364 | containers = { 365 | "dimensions": st.container(), 366 | "measures": st.container(), 367 | "time_dimensions": st.container(), 368 | } 369 | 370 | # Assign labels to the containers 371 | for key in containers.keys(): 372 | containers[key].write(f"**{key.replace('_', ' ').title()}**") 373 | 374 | # Initialize sections as empty lists 375 | sections: dict[str, list[dict[str, Any]]] = { 376 | key: [] for key in containers.keys() 377 | } 378 | for k, v in combined_fields_df.iterrows(): 379 | # Get destination section and intended data type for cortex analyst semantic file 380 | # If the key is found from the generator, use it. Otherwise, use the partner-specific logic. 
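                # e.g. (illustrative) a field that exists only in the partner file arrives here
                # with section_cortex=None and section_partner="measures", so it is routed to the
                # "measures" container, and compare_data_types falls back to the partner's
                # data_type (or "TEXT" if neither side declares one).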
381 | target_section = compare_sections( 382 | v["section_cortex"], v["section_partner"] 383 | ) 384 | target_data_type = compare_data_types( 385 | v["field_details_cortex"], v["field_details_partner"] 386 | ) 387 | with containers[target_section]: 388 | selected_metadata = PartnerCompareRow(v).render_row() 389 | if selected_metadata: 390 | selected_metadata["data_type"] = target_data_type 391 | sections[target_section].append(selected_metadata) 392 | 393 | integrate_col, commit_col, _ = st.columns((1, 1, 5), gap="small") 394 | with integrate_col: 395 | merge_button = st.button( 396 | "Merge", help=INTEGRATE_HELP, use_container_width=True 397 | ) 398 | with commit_col: 399 | reset_button = st.button( 400 | "Save", 401 | help=SAVE_HELP, 402 | use_container_width=True, 403 | ) 404 | 405 | if merge_button: 406 | set_sit_query_tag( 407 | get_snowflake_connection(), 408 | vendor=st.session_state["selected_partner"], 409 | action="merge", 410 | ) 411 | # Update fields in cortex semantic model 412 | for i, tbl in enumerate(st.session_state["cortex_comparison_tables"]): 413 | if tbl.get_name() == semantic_cortex_tbl: 414 | for k in sections.keys(): 415 | st.session_state["current_yaml_as_dict"]["tables"][i][k] = ( 416 | sections[k] 417 | ) 418 | 419 | try: 420 | st.session_state["yaml"] = yaml.dump( 421 | st.session_state["current_yaml_as_dict"], sort_keys=False 422 | ) 423 | st.session_state["semantic_model"] = yaml_to_semantic_model( 424 | st.session_state["yaml"] 425 | ) 426 | merge_msg = st.success("Merging...") 427 | time.sleep(1) 428 | merge_msg.empty() 429 | except Exception as e: 430 | st.error(f"Integration failed: {e}") 431 | 432 | if reset_button: 433 | set_sit_query_tag( 434 | get_snowflake_connection(), 435 | vendor=st.session_state["selected_partner"], 436 | action="integration_complete", 437 | ) 438 | st.success( 439 | "Integration complete! Please validate your semantic model before uploading." 440 | ) 441 | time.sleep(1.5) 442 | st.rerun() # Lazy alternative to resetting all configurations 443 | else: 444 | st.error("Partner semantic not setup.") 445 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.poetry] 2 | name = "semantic-model-generator" 3 | version = "1.0.0" 4 | description = "Curate a Semantic Model for Snowflake Cortex Analyst" 5 | authors = ["Jonathan Hilgart ", "Nipun Sehrawat ", "Renee Huang ", "Nicole Limtiaco "] 6 | license = "Apache Software License; BSD License" 7 | readme = "README.md" 8 | 9 | [tool.poetry.dependencies] 10 | python = ">=3.9,<3.9.7 || >3.9.7,<3.12" 11 | pandas = "^2.0.1" 12 | loguru = "^0.7.2" 13 | snowflake-connector-python = { extras = ["secure-local-storage", "pandas"], version = "^3.11.0" } 14 | protobuf = "5.26.1" 15 | pydantic = "2.8.2" 16 | PyYAML = "^6.0.1" 17 | "ruamel.yaml" = "0.17.21" 18 | tqdm = "^4.66.5" 19 | pyarrow = "14.0.2" 20 | sqlglot = "25.10.0" 21 | strictyaml = "^1.7.3" 22 | streamlit = "1.36.0" 23 | streamlit-extras = "0.4.0" 24 | numpy = "^1.26.4" 25 | python-dotenv = "^1.0.1" 26 | urllib3 = "^1.26.19" 27 | requests = "^2.32.3" 28 | snowflake-snowpark-python = "1.18.0" 29 | 30 | # Optional dependencies for functionality such as partner semantic model support. 
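# (e.g. enabled via `poetry install --extras looker`; see [tool.poetry.extras] below)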
31 | looker-sdk = { version = "^24.14.0", optional = true } 32 | 33 | [tool.poetry.group.dev.dependencies] 34 | mypy = "^1.9.0" 35 | black = "^24.3.0" 36 | isort = "^5.13.2" 37 | flake8 = "^7.0.0" 38 | pytest = "^8.1.1" 39 | types-pyyaml = "^6.0.12.20240311" 40 | types-protobuf = "^4.24.0.20240311" 41 | pip-licenses = "^4.4.0" 42 | grpcio-tools = "1.64.1" 43 | 44 | [tool.poetry.extras] 45 | looker = ["looker-sdk"] 46 | 47 | [build-system] 48 | requires = ["poetry-core"] 49 | build-backend = "poetry.core.masonry.api" 50 | -------------------------------------------------------------------------------- /semantic_model_generator/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/semantic_model_generator/__init__.py -------------------------------------------------------------------------------- /semantic_model_generator/data_processing/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/semantic_model_generator/data_processing/__init__.py -------------------------------------------------------------------------------- /semantic_model_generator/data_processing/cte_utils.py: -------------------------------------------------------------------------------- 1 | # TODO: Add tests for quoted columns, which are not well tested today. 2 | 3 | import copy 4 | from typing import List, Optional 5 | 6 | import sqlglot 7 | import sqlglot.expressions 8 | from loguru import logger 9 | from sqlglot.dialects.snowflake import Snowflake 10 | 11 | from semantic_model_generator.protos import semantic_model_pb2 12 | from semantic_model_generator.snowflake_utils.snowflake_connector import ( 13 | OBJECT_DATATYPES, 14 | ) 15 | 16 | _LOGICAL_TABLE_PREFIX = "__" 17 | 18 | 19 | def is_logical_table(table_name: str) -> bool: 20 | """Returns true if 'table_name' is a logical table name.""" 21 | return table_name.startswith(_LOGICAL_TABLE_PREFIX) and len(table_name) > len( 22 | _LOGICAL_TABLE_PREFIX 23 | ) 24 | 25 | 26 | def logical_table_name(table: semantic_model_pb2.Table) -> str: 27 | """Returns the name of logical table for a given table. E.g. __fact""" 28 | return _LOGICAL_TABLE_PREFIX + table.name # type: ignore[no-any-return] 29 | 30 | 31 | def fully_qualified_table_name(table: semantic_model_pb2.FullyQualifiedTable) -> str: 32 | """Returns fully qualified table name such as my_db.my_schema.my_table""" 33 | fqn = table.table 34 | if len(table.schema) > 0: 35 | fqn = f"{table.schema}.{fqn}" 36 | if len(table.database) > 0: 37 | fqn = f"{table.database}.{fqn}" 38 | return fqn # type: ignore[no-any-return] 39 | 40 | 41 | def is_aggregation_expr(col: semantic_model_pb2.Column) -> bool: 42 | """Check if an expr contains aggregation function. 43 | Note: only flag True for aggregations that would changes number of rows of data. 44 | For window function, given the operation will produce value per row, mark as False here. 45 | 46 | Raises: 47 | ValueError: if expr is not parsable, or if aggregation expressions in non-measure columns. 
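    Examples (illustrative): "sum(cost)" on a measure column returns True; "cost / clicks"
    returns False; a pure window expression such as
    "sum(cost) over (partition by campaign_id)" also returns False, since it preserves the row count.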
48 | """ 49 | parsed = sqlglot.parse_one(col.expr, dialect=Snowflake) 50 | agg_func = list(parsed.find_all(sqlglot.expressions.AggFunc)) 51 | window = list(parsed.find_all(sqlglot.expressions.Window)) 52 | # We've confirmed window functions cannot appear inside aggregate functions 53 | # (gets execution error msg: Window function [SUM(...) OVER (PARTITION BY ...)] may not appear inside an aggregate function). 54 | # So if there's a window function present there can't also be an aggregate function applied to the window function. 55 | if len(agg_func) > 0 and len(window) == 0: 56 | if col.kind != 2: 57 | raise ValueError("Only allow aggregation expressions for measures.") 58 | return True 59 | return False 60 | 61 | 62 | def _is_physical_table_column(col: semantic_model_pb2.Column) -> bool: 63 | """Returns whether the column refers to a single raw table column.""" 64 | try: 65 | parsed = sqlglot.parse_one(col.expr, dialect=Snowflake) 66 | return isinstance(parsed, sqlglot.expressions.Column) 67 | except Exception as ex: 68 | logger.warning( 69 | f"Failed to parse sql expression: {col.expr}. Error: {ex}. {col}" 70 | ) 71 | return False 72 | 73 | 74 | def _is_identifier_quoted(col_name: str) -> bool: 75 | return '"' in col_name 76 | 77 | 78 | def remove_ltable_cte(sql_w_ltable_cte: str, table_names: list[str]) -> str: 79 | """ 80 | Given a SQL with prefix'd logical table conversion CTE(s), remove the logical table conversions. 81 | Args: 82 | sql_w_ltable_cte: the sql with logical table conversion CTE(s). 83 | table_names: list of tables in the semantic model. 84 | 85 | Returns: the sql without the logical table conversion CTE. 86 | Raises: ValueError if didn't find any CTE or parsed first CTE is not logical table CTE. 87 | """ 88 | ast = sqlglot.parse_one(sql_w_ltable_cte, read=Snowflake) 89 | with_ = ast.args.get("with") 90 | if with_ is None: 91 | raise ValueError("Analyst queries must contain the logical CTE.") 92 | if not is_logical_table(with_.expressions[0].alias): 93 | raise ValueError("Analyst queries must contain the logical CTE.") 94 | 95 | table_names_lower = [table_name.lower() for table_name in table_names] 96 | # Iterate through all CTEs, and filter out logical table CTEs. 97 | # This is done by checking if the CTE alias starts with the logical table prefix and if the alias is in a table in the semantic model. 98 | non_logical_cte = [ 99 | cte 100 | for cte in with_.expressions 101 | if not is_logical_table(cte.alias) 102 | or cte.alias.replace(_LOGICAL_TABLE_PREFIX, "").lower() not in table_names_lower 103 | ] 104 | 105 | # Replace the original expressions list with the filtered list 106 | with_.set("expressions", non_logical_cte) 107 | 108 | # If no expressions are left for whatever reason, remove the entire WITH clause. 109 | if not with_.expressions: 110 | ast.set("with", None) 111 | 112 | sql_without_logical_cte = ast.sql(dialect=Snowflake, pretty=True) 113 | return sql_without_logical_cte # type: ignore [no-any-return] 114 | 115 | 116 | def _validate_col(column: semantic_model_pb2.Column) -> None: 117 | if " " in column.name.strip(): 118 | raise ValueError( 119 | f"Please do not include spaces in your column name: {column.name}" 120 | ) 121 | if column.data_type.upper() in OBJECT_DATATYPES: 122 | raise ValueError( 123 | f"We do not support object datatypes in the semantic model. Col {column.name} has data type {column.data_type}. Please remove this column from your semantic model or flatten it to non-object type." 
124 | ) 125 | 126 | 127 | def validate_all_cols(table: semantic_model_pb2.Table) -> None: 128 | for column in table.columns: 129 | _validate_col(column) 130 | 131 | 132 | def _get_col_expr(column: semantic_model_pb2.Column) -> str: 133 | """Return column expr in SQL format. 134 | Raise errors if columns is of OBJECT_DATATYPES, which we do not support today.""" 135 | return ( 136 | f"{column.expr.strip()} as {column.name.strip()}" 137 | if column.expr.strip().lower() != column.name.strip().lower() 138 | else f"{column.expr.strip()}" 139 | ) 140 | 141 | 142 | def _generate_cte_for( 143 | table: semantic_model_pb2.Table, columns: List[semantic_model_pb2.Column] 144 | ) -> str: 145 | """ 146 | Returns a CTE representing a logical table that selects 'col' columns from 'table'. 147 | """ 148 | 149 | if len(columns) == 0: 150 | raise ValueError("Please include at least one column to generate CTE on.") 151 | else: 152 | expr_columns = [_get_col_expr(col) for col in columns] 153 | cte = f"WITH {logical_table_name(table)} AS (\n" 154 | cte += "SELECT \n" 155 | cte += ",\n".join(expr_columns) + "\n" 156 | cte += f"FROM {fully_qualified_table_name(table.base_table)}" 157 | cte += ")" 158 | return cte 159 | 160 | 161 | def get_all_physical_column_references( 162 | column: semantic_model_pb2.Column, 163 | ) -> List[str]: 164 | """Returns a set of column names referenced in the column expression. 165 | 166 | For example, the following column expressions yield the following return values: 167 | foo -> [foo] 168 | foo+bar -> [foo, bar] 169 | sum(foo) -> [foo] 170 | """ 171 | try: 172 | parsed = sqlglot.parse_one(column.expr, dialect=Snowflake) 173 | col_names = set() 174 | for col in parsed.find_all(sqlglot.expressions.Column): 175 | # TODO(renee): Handle quoted columns. 176 | col_name = col.name.lower() 177 | if col.this.quoted: 178 | col_name = col.name 179 | col_names.add(col_name) 180 | return sorted(list(col_names)) 181 | except Exception as ex: 182 | raise ValueError(f"Failed to parse sql expression: {column.expr}. Error: {ex}") 183 | 184 | 185 | def direct_mapping_logical_columns( 186 | table: semantic_model_pb2.Table, 187 | ) -> List[semantic_model_pb2.Column]: 188 | """ 189 | Returns a list of logical columns that map 1:1 to an underlying physical column 190 | (i.e. logical table's expression is simply the physical column name) in this table. 191 | """ 192 | ret: List[semantic_model_pb2.Column] = [] 193 | for c in table.columns: 194 | if _is_physical_table_column(c): 195 | ret.append(c) 196 | return ret 197 | 198 | 199 | def _enrich_column_in_expr_with_aggregation( 200 | table: semantic_model_pb2.Table, 201 | ) -> semantic_model_pb2.Table: 202 | """ 203 | Expands the logical columns of 'table' to include columns mentioned in a logical columns 204 | with an aggregate expression. E.g. for a logical column called CPC with expr sum(cost) / sum(clicks), 205 | adds logical columns for "cost" and "clicks", if not present. 206 | """ 207 | direct_mapping_lcols = [ 208 | c.name.lower() for c in direct_mapping_logical_columns(table) 209 | ] 210 | cols_to_append = set() 211 | for col in table.columns: 212 | if not is_aggregation_expr(col): 213 | continue 214 | for pcol in get_all_physical_column_references(col): 215 | # If the physical column doesn't have a direct mapping logical column 216 | # with the same name, then we need to add a new logical column for it. 
217 | # Note that this may introduce multiple logical columns directly referencing 218 | # the same physical column, something we should improve up, perhaps by 219 | # rewriting the expression to use existing direct mapping logical columns 220 | # whenever preset. 221 | if pcol not in direct_mapping_lcols: 222 | cols_to_append.add(pcol) 223 | 224 | original_cols = {col.name.lower(): col.expr for col in table.columns} 225 | ret = copy.deepcopy(table) 226 | # Insert in sorted order to make this method deterministic. 227 | for c in sorted(cols_to_append): 228 | if c in original_cols: 229 | logger.warning( 230 | f"Not adding a logical column for physical column {c} in table {table.name}, " 231 | f"since this logical column already exists with expression {original_cols[c]}" 232 | ) 233 | else: 234 | new_col = semantic_model_pb2.Column(name=c, expr=c) 235 | ret.columns.append(new_col) 236 | return ret 237 | 238 | 239 | def _generate_non_agg_cte(table: semantic_model_pb2.Table) -> Optional[str]: 240 | """ 241 | Returns a CTE representing a logical table that selects 'col' columns from 'table' except for aggregation columns. 242 | """ 243 | filtered_cols = [col for col in table.columns if not is_aggregation_expr(col)] 244 | if len(filtered_cols) > 0: 245 | return _generate_cte_for(table, filtered_cols) 246 | else: 247 | return None 248 | 249 | 250 | def _convert_to_snowflake_sql(sql: str) -> str: 251 | """ 252 | Converts a given SQL statement to Snowflake SQL syntax using SQLGlot. 253 | 254 | Args: 255 | sql (str): The SQL statement to convert. 256 | 257 | Returns: 258 | str: The SQL statement in Snowflake syntax. 259 | """ 260 | try: 261 | expression = sqlglot.parse_one(sql, dialect=Snowflake) 262 | except Exception as e: 263 | raise ValueError( 264 | f"Unable to parse sql statement.\n Provided sql: {sql}\n. Error: {e}" 265 | ) 266 | 267 | return expression.sql() 268 | 269 | 270 | def generate_select( 271 | table_in_column_format: semantic_model_pb2.Table, limit: int 272 | ) -> List[str]: 273 | """Generate select query for all columns for validation purpose.""" 274 | sqls_to_return: List[str] = [] 275 | # Generate select query for columns without aggregation exprs. 276 | non_agg_cte = _generate_non_agg_cte(table_in_column_format) 277 | if non_agg_cte is not None: 278 | non_agg_sql = ( 279 | non_agg_cte 280 | + f"SELECT * FROM {logical_table_name(table_in_column_format)} LIMIT {limit}" 281 | ) 282 | sqls_to_return.append(_convert_to_snowflake_sql(non_agg_sql)) 283 | 284 | # Generate select query for columns with aggregation exprs. 285 | agg_cols = [ 286 | col for col in table_in_column_format.columns if is_aggregation_expr(col) 287 | ] 288 | if len(agg_cols) == 0: 289 | return sqls_to_return 290 | else: 291 | agg_cte = _generate_cte_for(table_in_column_format, agg_cols) 292 | agg_sql = ( 293 | agg_cte 294 | + f"SELECT * FROM {logical_table_name(table_in_column_format)} LIMIT {limit}" 295 | ) 296 | sqls_to_return.append(_convert_to_snowflake_sql(agg_sql)) 297 | return sqls_to_return 298 | 299 | 300 | def expand_all_logical_tables_as_ctes( 301 | sql_query: str, model_in_column_format: semantic_model_pb2.SemanticModel 302 | ) -> str: 303 | """ 304 | Returns a SQL query that expands all logical tables contained in ctx as ctes. 305 | """ 306 | 307 | def generate_full_logical_table_ctes( 308 | ctx: semantic_model_pb2.SemanticModel, 309 | ) -> List[str]: 310 | """ 311 | Given an arbitrary SQL, returns a list of CTEs representing all the logical tables 312 | referenced in it. 
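        Illustrative shape of one generated CTE (table and column names are examples only):
            WITH __orders AS (SELECT order_id, order_ts as created_at FROM my_db.my_schema.orders)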
313 | """ 314 | ctes: List[str] = [] 315 | for table in ctx.tables: 316 | # Append all columns and expressions for the logical table. 317 | # If table contains expr with aggregations, enrich its referred columns into the table. 318 | table_ = _enrich_column_in_expr_with_aggregation(table) 319 | cte = _generate_non_agg_cte(table_) 320 | if cte is not None: 321 | ctes.append(cte) 322 | return ctes 323 | 324 | # Step 1: Generate a CTE for each logical table referenced in the query. 325 | ctes = generate_full_logical_table_ctes(model_in_column_format) 326 | 327 | # Step 2: Parse each generated CTE as a 'WITH' clause. 328 | new_withs = [] 329 | for cte in ctes: 330 | new_withs.append( 331 | sqlglot.parse_one(cte, read=Snowflake, into=sqlglot.expressions.With) 332 | ) 333 | 334 | # Step 3: Prefix the CTEs to the original query. 335 | ast = sqlglot.parse_one(sql_query, read=Snowflake) 336 | with_ = ast.args.get("with") 337 | # If the query doesn't have a WITH clause, then generate one. 338 | if with_ is None: 339 | merged_with = new_withs[0] 340 | remaining_ctes = [w.expressions[0] for w in new_withs[1:]] 341 | merged_with.set("expressions", merged_with.expressions + remaining_ctes) 342 | ast.set("with", merged_with) 343 | # If the query already has a WITH clause, prefix the CTEs to it. 344 | else: 345 | new_ctes = [w.expressions[0] for w in new_withs] 346 | with_.set("expressions", new_ctes + with_.expressions) 347 | return ast.sql(dialect=Snowflake, pretty=True) # type: ignore [no-any-return] 348 | 349 | 350 | def context_to_column_format( 351 | ctx: semantic_model_pb2.SemanticModel, 352 | ) -> semantic_model_pb2.SemanticModel: 353 | """ 354 | Converts semantic_model_pb2.SemanticModel from a dimension/measure format to a column format. 355 | Returns a new semantic_model_pb2.SemanticModel object that's in column format. 356 | """ 357 | ret = semantic_model_pb2.SemanticModel() 358 | ret.CopyFrom(ctx) 359 | for table in ret.tables: 360 | column_format = len(table.columns) > 0 361 | dimension_measure_format = ( 362 | len(table.dimensions) > 0 363 | or len(table.time_dimensions) > 0 364 | or len(table.measures) > 0 365 | ) 366 | if column_format and dimension_measure_format: 367 | raise ValueError( 368 | "table {table.name} defines both columns and dimensions/time_dimensions/measures." 
369 | ) 370 | if column_format: 371 | continue 372 | for d in table.dimensions: 373 | col = semantic_model_pb2.Column() 374 | col.kind = semantic_model_pb2.ColumnKind.dimension 375 | col.name = d.name 376 | col.synonyms.extend(d.synonyms) 377 | col.description = d.description 378 | col.expr = d.expr 379 | col.data_type = d.data_type 380 | col.unique = d.unique 381 | col.sample_values.extend(d.sample_values) 382 | table.columns.append(col) 383 | del table.dimensions[:] 384 | 385 | for td in table.time_dimensions: 386 | col = semantic_model_pb2.Column() 387 | col.kind = semantic_model_pb2.ColumnKind.time_dimension 388 | col.name = td.name 389 | col.synonyms.extend(td.synonyms) 390 | col.description = td.description 391 | col.expr = td.expr 392 | col.data_type = td.data_type 393 | col.unique = td.unique 394 | col.sample_values.extend(td.sample_values) 395 | table.columns.append(col) 396 | del table.time_dimensions[:] 397 | 398 | for m in table.measures: 399 | col = semantic_model_pb2.Column() 400 | col.kind = semantic_model_pb2.ColumnKind.measure 401 | col.name = m.name 402 | col.synonyms.extend(m.synonyms) 403 | col.description = m.description 404 | col.expr = m.expr 405 | col.data_type = m.data_type 406 | col.default_aggregation = m.default_aggregation 407 | col.sample_values.extend(m.sample_values) 408 | table.columns.append(col) 409 | del table.measures[:] 410 | return ret 411 | -------------------------------------------------------------------------------- /semantic_model_generator/data_processing/cte_utils_test.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | import pytest 4 | 5 | from semantic_model_generator.data_processing.cte_utils import remove_ltable_cte 6 | 7 | 8 | class TestRemoveLogicalTableCTE: 9 | def test_removes_logical_table_cte(self) -> None: 10 | """ 11 | Testing that we remove logical table CTEs corresponding to existing table names. 12 | """ 13 | query = "WITH __logical_table AS (SELECT * FROM table1) SELECT * FROM __logical_table" 14 | table_names = ["LOGICAL_TABLE"] 15 | expected_query = "SELECT * FROM __logical_table" 16 | 17 | actual_output = remove_ltable_cte(query, table_names=table_names) 18 | actual_output = re.sub(r"\s+", " ", actual_output) 19 | 20 | assert actual_output == expected_query 21 | 22 | def test_does_not_remove_non_logical_cte(self) -> None: 23 | """ 24 | Testing that CTEs not mapping to existing table names are not removed. 25 | """ 26 | query = ( 27 | "WITH __other_table AS (SELECT * FROM table1) SELECT * FROM __other_table" 28 | ) 29 | table_names = ["LOGICAL_TABLE"] 30 | expected_query = ( 31 | "WITH __other_table AS ( SELECT * FROM table1 ) SELECT * FROM __other_table" 32 | ) 33 | 34 | actual_output = remove_ltable_cte(query, table_names=table_names) 35 | actual_output = re.sub(r"\s+", " ", actual_output) 36 | 37 | assert actual_output == expected_query 38 | 39 | def test_mixed_ctes(self) -> None: 40 | """ 41 | Given a query containing a mixture of CTEs, only the logical table CTEs should be removed. 
42 | """ 43 | query = "WITH __logical_table AS (SELECT * FROM table1), __other_table AS (SELECT * FROM table2), __custom_table AS (SELECT * FROM table3) SELECT * FROM __logical_table" 44 | table_names = ["LOGICAL_TABLE"] 45 | expected_query = "WITH __other_table AS ( SELECT * FROM table2 ), __custom_table AS ( SELECT * FROM table3 ) SELECT * FROM __logical_table" 46 | 47 | actual_output = remove_ltable_cte(query, table_names=table_names) 48 | actual_output = re.sub(r"\s+", " ", actual_output) 49 | 50 | assert actual_output == expected_query 51 | 52 | def test_throws_value_error_without_cte(self) -> None: 53 | """ 54 | Testing that an error is thrown if there is no CTE in the query. 55 | """ 56 | query = "SELECT * FROM table1" 57 | table_names = ["LOGICAL_TABLE"] 58 | 59 | with pytest.raises(ValueError): 60 | remove_ltable_cte(query, table_names=table_names) 61 | 62 | def test_throws_value_error_if_first_cte_not_logical_table(self) -> None: 63 | """ 64 | Testing that an error is thrown if the first CTE is not a logical table. 65 | """ 66 | query = "WITH random_alias AS (SELECT * FROM table1), __logical_table AS (SELECT * FROM table2) SELECT * FROM __logical_table" 67 | table_names = ["LOGICAL_TABLE"] 68 | 69 | with pytest.raises(ValueError): 70 | remove_ltable_cte(query, table_names=table_names) 71 | -------------------------------------------------------------------------------- /semantic_model_generator/data_processing/data_types.py: -------------------------------------------------------------------------------- 1 | from typing import Any, List, Optional 2 | 3 | from pydantic.dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class FQNParts: 8 | database: str 9 | schema_name: str 10 | table: str 11 | 12 | def __post_init__(self: Any) -> None: 13 | """Uppercase table name""" 14 | self.table = self.table.upper() 15 | 16 | 17 | @dataclass 18 | class Column: 19 | id_: int 20 | column_name: str 21 | column_type: str 22 | values: Optional[List[str]] = None 23 | comment: Optional[str] = ( 24 | None # comment field's to save the column comment user specified on the column 25 | ) 26 | 27 | def __post_init__(self: Any) -> None: 28 | """ 29 | Update column_type to cleaned up version, eg. NUMBER(38,0) -> NUMBER 30 | """ 31 | 32 | self.column_type = self.column_type.split("(")[0].strip().upper() 33 | 34 | 35 | @dataclass 36 | class Table: 37 | id_: int 38 | name: str 39 | columns: List[Column] 40 | comment: Optional[str] = ( 41 | None # comment field's to save the table comment user specified on the table 42 | ) 43 | 44 | def __post_init__(self: Any) -> None: 45 | for col in self.columns: 46 | if col.column_name == "": 47 | raise ValueError("column name in table must be nonempty") 48 | -------------------------------------------------------------------------------- /semantic_model_generator/data_processing/proto_utils.py: -------------------------------------------------------------------------------- 1 | import io 2 | import json 3 | from typing import Any, TypeVar 4 | 5 | import ruamel.yaml 6 | from google.protobuf import json_format 7 | from google.protobuf.message import Message 8 | from strictyaml import dirty_load 9 | 10 | from semantic_model_generator.protos import semantic_model_pb2 11 | from semantic_model_generator.validate.schema import SCHEMA 12 | 13 | ProtoMsg = TypeVar("ProtoMsg", bound=Message) 14 | 15 | 16 | def proto_to_yaml(message: ProtoMsg) -> str: 17 | """Serializes the input proto into a yaml message. 18 | 19 | Args: 20 | message: Protobuf message to be serialized. 
21 | 22 | Returns: 23 | The serialized yaml string, or None if an error occurs. 24 | """ 25 | try: 26 | json_data = json.loads( 27 | json_format.MessageToJson(message, preserving_proto_field_name=True) 28 | ) 29 | 30 | # Using ruamel.yaml package to preserve message order. 31 | yaml = ruamel.yaml.YAML() 32 | yaml.indent(mapping=2, sequence=4, offset=2) 33 | yaml.preserve_quotes = True 34 | 35 | with io.StringIO() as stream: 36 | yaml.dump(json_data, stream) 37 | yaml_str = stream.getvalue() 38 | assert isinstance(yaml_str, str) 39 | return yaml_str 40 | except Exception as e: 41 | raise ValueError(f"Failed to convert protobuf message to YAML: {e}") 42 | 43 | 44 | def proto_to_dict(message: ProtoMsg) -> dict[str, Any]: 45 | """Serializes the input proto into a dictionary. 46 | 47 | Args: 48 | message: Protobuf message to be serialized. 49 | 50 | Returns: 51 | The serialized dictionary, or None if an error occurs. 52 | """ 53 | try: 54 | # Convert the Protobuf message to JSON string. 55 | json_str = json_format.MessageToJson(message, preserving_proto_field_name=True) 56 | 57 | # Convert the JSON string to a Python dictionary. 58 | json_data = json.loads(json_str) 59 | 60 | assert isinstance(json_data, dict) 61 | return json_data 62 | except Exception as e: 63 | raise ValueError(f"Failed to convert protobuf message to dictionary: {e}") 64 | 65 | 66 | def yaml_to_semantic_model(yaml_str: str) -> semantic_model_pb2.SemanticModel: 67 | """ 68 | Deserializes the input yaml into a SemanticModel Protobuf message. The 69 | input yaml must be fully representable as json, so yaml features like 70 | custom types and block scalars are not supported. 71 | 72 | Args: 73 | yaml_str: Path to the YAML file. 74 | 75 | Returns: 76 | The deserialized SemanticModel protobuf message 77 | """ 78 | 79 | # strictyaml is very opinionated on the style of yaml, and rejects yamls that use flow style (e.g. lists with [] 80 | # or maps with {}). See https://hitchdev.com/strictyaml/why/flow-style-removed/. This is purely a style preference 81 | # and those yamls are still parsable. To allow such yamls, we use dirty_load here, which behaves exactly as the 82 | # load method but allows flow style. 83 | parsed_yaml = dirty_load( 84 | yaml_str, SCHEMA, label="semantic model", allow_flow_style=True 85 | ) 86 | msg = semantic_model_pb2.SemanticModel() 87 | return json_format.ParseDict(parsed_yaml.data, msg) 88 | -------------------------------------------------------------------------------- /semantic_model_generator/generate_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | from datetime import datetime 3 | from typing import List, Optional 4 | 5 | from loguru import logger 6 | from snowflake.connector import SnowflakeConnection 7 | 8 | from semantic_model_generator.data_processing import data_types, proto_utils 9 | from semantic_model_generator.protos import semantic_model_pb2 10 | from semantic_model_generator.snowflake_utils.snowflake_connector import ( 11 | AUTOGEN_TOKEN, 12 | DIMENSION_DATATYPES, 13 | MEASURE_DATATYPES, 14 | OBJECT_DATATYPES, 15 | TIME_MEASURE_DATATYPES, 16 | get_table_representation, 17 | get_valid_schemas_tables_columns_df, 18 | ) 19 | from semantic_model_generator.snowflake_utils.utils import create_fqn_table 20 | from semantic_model_generator.validate.context_length import validate_context_length 21 | 22 | _PLACEHOLDER_COMMENT = " " 23 | _FILL_OUT_TOKEN = " # " 24 | # TODO add _AUTO_GEN_TOKEN to the end of the auto generated descriptions. 
25 | _AUTOGEN_COMMENT_TOKEN = ( 26 | " # " 27 | ) 28 | _DEFAULT_N_SAMPLE_VALUES_PER_COL = 3 29 | _AUTOGEN_COMMENT_WARNING = f"# NOTE: This file was auto-generated by the semantic model generator. Please fill out placeholders marked with {_FILL_OUT_TOKEN} (or remove if not relevant) and verify autogenerated comments.\n" 30 | 31 | 32 | def _get_placeholder_filter() -> List[semantic_model_pb2.NamedFilter]: 33 | return [ 34 | semantic_model_pb2.NamedFilter( 35 | name=_PLACEHOLDER_COMMENT, 36 | synonyms=[_PLACEHOLDER_COMMENT], 37 | description=_PLACEHOLDER_COMMENT, 38 | expr=_PLACEHOLDER_COMMENT, 39 | ) 40 | ] 41 | 42 | 43 | def _get_placeholder_joins() -> List[semantic_model_pb2.Relationship]: 44 | return [ 45 | semantic_model_pb2.Relationship( 46 | name=_PLACEHOLDER_COMMENT, 47 | left_table=_PLACEHOLDER_COMMENT, 48 | right_table=_PLACEHOLDER_COMMENT, 49 | join_type=semantic_model_pb2.JoinType.inner, 50 | relationship_columns=[ 51 | semantic_model_pb2.RelationKey( 52 | left_column=_PLACEHOLDER_COMMENT, 53 | right_column=_PLACEHOLDER_COMMENT, 54 | ) 55 | ], 56 | relationship_type=semantic_model_pb2.RelationshipType.many_to_one, 57 | ) 58 | ] 59 | 60 | 61 | def _raw_table_to_semantic_context_table( 62 | database: str, schema: str, raw_table: data_types.Table 63 | ) -> semantic_model_pb2.Table: 64 | """ 65 | Converts a raw table representation to a semantic model table in protobuf format. 66 | 67 | Args: 68 | database (str): The name of the database containing the table. 69 | schema (str): The name of the schema containing the table. 70 | raw_table (data_types.Table): The raw table object to be transformed. 71 | 72 | Returns: 73 | semantic_model_pb2.Table: A protobuf representation of the semantic table. 74 | 75 | This function categorizes table columns into TimeDimensions, Dimensions, or Measures based on their data type, 76 | populates them with sample values, and sets placeholders for descriptions and filters. 77 | """ 78 | 79 | # For each column, decide if it is a TimeDimension, Measure, or Dimension column. 80 | # For now, we decide this based on datatype. 81 | # Any time datatype, is TimeDimension. 82 | # Any varchar/text is Dimension. 83 | # Any numerical column is Measure. 
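    # For example (illustrative Snowflake types):
    #   DATE, TIMESTAMP_NTZ -> TimeDimension
    #   VARCHAR, TEXT       -> Dimension
    #   NUMBER, FLOAT       -> Measure (emitted as a Fact)
    #   VARIANT, OBJECT     -> skipped (object datatypes are not supported)
    # Anything unrecognized falls back to a Dimension with a warning.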
84 | 85 | time_dimensions = [] 86 | dimensions = [] 87 | measures = [] 88 | 89 | for col in raw_table.columns: 90 | if col.column_type.upper() in TIME_MEASURE_DATATYPES: 91 | time_dimensions.append( 92 | semantic_model_pb2.TimeDimension( 93 | name=col.column_name, 94 | expr=col.column_name, 95 | data_type=col.column_type, 96 | sample_values=col.values, 97 | synonyms=[_PLACEHOLDER_COMMENT], 98 | description=col.comment if col.comment else _PLACEHOLDER_COMMENT, 99 | ) 100 | ) 101 | 102 | elif col.column_type.upper() in DIMENSION_DATATYPES: 103 | dimensions.append( 104 | semantic_model_pb2.Dimension( 105 | name=col.column_name, 106 | expr=col.column_name, 107 | data_type=col.column_type, 108 | sample_values=col.values, 109 | synonyms=[_PLACEHOLDER_COMMENT], 110 | description=col.comment if col.comment else _PLACEHOLDER_COMMENT, 111 | ) 112 | ) 113 | 114 | elif col.column_type.upper() in MEASURE_DATATYPES: 115 | measures.append( 116 | semantic_model_pb2.Fact( 117 | name=col.column_name, 118 | expr=col.column_name, 119 | data_type=col.column_type, 120 | sample_values=col.values, 121 | synonyms=[_PLACEHOLDER_COMMENT], 122 | description=col.comment if col.comment else _PLACEHOLDER_COMMENT, 123 | ) 124 | ) 125 | elif col.column_type.upper() in OBJECT_DATATYPES: 126 | logger.warning( 127 | f"""We don't currently support {col.column_type} as an input column datatype to the Semantic Model. We are skipping column {col.column_name} for now.""" 128 | ) 129 | continue 130 | else: 131 | logger.warning( 132 | f"Column datatype does not map to a known datatype. Input was = {col.column_type}. We are going to place as a Dimension for now." 133 | ) 134 | dimensions.append( 135 | semantic_model_pb2.Dimension( 136 | name=col.column_name, 137 | expr=col.column_name, 138 | data_type=col.column_type, 139 | sample_values=col.values, 140 | synonyms=[_PLACEHOLDER_COMMENT], 141 | description=col.comment if col.comment else _PLACEHOLDER_COMMENT, 142 | ) 143 | ) 144 | if len(time_dimensions) + len(dimensions) + len(measures) == 0: 145 | raise ValueError( 146 | f"No valid columns found for table {raw_table.name}. Please verify that this table contains column's datatypes not in {OBJECT_DATATYPES}." 147 | ) 148 | 149 | return semantic_model_pb2.Table( 150 | name=raw_table.name, 151 | base_table=semantic_model_pb2.FullyQualifiedTable( 152 | database=database, schema=schema, table=raw_table.name 153 | ), 154 | # For fields we can not automatically infer, leave a comment for the user to fill out. 155 | description=raw_table.comment if raw_table.comment else _PLACEHOLDER_COMMENT, 156 | filters=_get_placeholder_filter(), 157 | dimensions=dimensions, 158 | time_dimensions=time_dimensions, 159 | measures=measures, 160 | ) 161 | 162 | 163 | def raw_schema_to_semantic_context( 164 | base_tables: List[str], 165 | semantic_model_name: str, 166 | conn: SnowflakeConnection, 167 | n_sample_values: int = _DEFAULT_N_SAMPLE_VALUES_PER_COL, 168 | allow_joins: Optional[bool] = False, 169 | ) -> semantic_model_pb2.SemanticModel: 170 | """ 171 | Converts a list of fully qualified Snowflake table names into a semantic model. 172 | 173 | Parameters: 174 | - base_tables (list[str]): Fully qualified table names to include in the semantic model. 175 | - snowflake_account (str): Snowflake account identifier. 176 | - semantic_model_name (str): A meaningful semantic model name. 177 | - conn (SnowflakeConnection): SnowflakeConnection to reuse. 178 | - n_sample_values (int): The number of sample values per col. 
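    - allow_joins (Optional[bool]): Whether to include placeholder relationship/join sections in the generated model.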
179 | 180 | Returns: 181 | - The semantic model (semantic_model_pb2.SemanticModel). 182 | 183 | This function fetches metadata for the specified tables, performs schema validation, extracts key information, 184 | enriches metadata from the Snowflake database, and constructs a semantic model in protobuf format. 185 | It handles different databases and schemas within the same account by creating unique Snowflake connections as needed. 186 | 187 | Raises: 188 | - AssertionError: If no valid tables are found in the specified schema. 189 | """ 190 | 191 | # For FQN tables, create a new snowflake connection per table in case the db/schema is different. 192 | table_objects = [] 193 | unique_database_schema: List[str] = [] 194 | for table in base_tables: 195 | # Verify this is a valid FQN table. For now, we check that the table follows the following format. 196 | # {database}.{schema}.{table} 197 | fqn_table = create_fqn_table(table) 198 | fqn_databse_schema = f"{fqn_table.database}.{fqn_table.schema_name}" 199 | 200 | if fqn_databse_schema not in unique_database_schema: 201 | unique_database_schema.append(fqn_databse_schema) 202 | 203 | logger.info(f"Pulling column information from {fqn_table}") 204 | valid_schemas_tables_columns_df = get_valid_schemas_tables_columns_df( 205 | conn=conn, 206 | db_name=fqn_table.database, 207 | table_schema=fqn_table.schema_name, 208 | table_names=[fqn_table.table], 209 | ) 210 | assert not valid_schemas_tables_columns_df.empty 211 | 212 | # get the valid columns for this table. 213 | valid_columns_df_this_table = valid_schemas_tables_columns_df[ 214 | valid_schemas_tables_columns_df["TABLE_NAME"] == fqn_table.table 215 | ] 216 | 217 | raw_table = get_table_representation( 218 | conn=conn, 219 | schema_name=fqn_databse_schema, # Fully-qualified schema 220 | table_name=fqn_table.table, # Non-qualified table name 221 | table_index=0, 222 | ndv_per_column=n_sample_values, # number of sample values to pull per column. 223 | columns_df=valid_columns_df_this_table, 224 | max_workers=1, 225 | ) 226 | table_object = _raw_table_to_semantic_context_table( 227 | database=fqn_table.database, 228 | schema=fqn_table.schema_name, 229 | raw_table=raw_table, 230 | ) 231 | table_objects.append(table_object) 232 | # TODO(jhilgart): Call cortex model to generate a semantically friendly name here. 233 | 234 | placeholder_relationships = _get_placeholder_joins() if allow_joins else None 235 | context = semantic_model_pb2.SemanticModel( 236 | name=semantic_model_name, 237 | tables=table_objects, 238 | relationships=placeholder_relationships, 239 | ) 240 | return context 241 | 242 | 243 | def comment_out_section(yaml_str: str, section_name: str) -> str: 244 | """ 245 | Comments out all lines in the specified section of a YAML string. 246 | 247 | Parameters: 248 | - yaml_str (str): The YAML string to process. 249 | - section_name (str): The name of the section to comment out. 250 | 251 | Returns: 252 | - str: The modified YAML string with the specified section commented out. 253 | """ 254 | updated_yaml = [] 255 | lines = yaml_str.split("\n") 256 | in_section = False 257 | section_indent_level = 0 258 | 259 | for line in lines: 260 | stripped_line = line.strip() 261 | 262 | # When we find a section with the provided name, we can start commenting out lines. 
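        # For example, an emitted section such as
        #     filters:
        #       - name: large_order
        #         expr: cogs > 100
        # is re-emitted with each line's indentation preserved and the content prefixed by "# ", so the
        # generated YAML ships with the section present but commented out for the user to review.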
263 | if stripped_line.startswith(f"{section_name}:"): 264 | in_section = True 265 | section_indent_level = len(line) - len(line.lstrip()) 266 | comment_indent = " " * section_indent_level 267 | updated_yaml.append(f"{comment_indent}# {line.strip()}") 268 | continue 269 | 270 | # Since this method parses a raw YAML string, we track whether we're in the section by the indentation level. 271 | # This is a pretty rough heuristic. 272 | current_indent_level = len(line) - len(line.lstrip()) 273 | if ( 274 | in_section 275 | and current_indent_level <= section_indent_level 276 | and stripped_line 277 | ): 278 | in_section = False 279 | 280 | # Comment out the field and its subsections, preserving the indentation level. 281 | if in_section and line.strip(): 282 | comment_indent = " " * current_indent_level 283 | updated_yaml.append(f"{comment_indent}# {line.strip()}") 284 | else: 285 | updated_yaml.append(line) 286 | 287 | return "\n".join(updated_yaml) 288 | 289 | 290 | def append_comment_to_placeholders(yaml_str: str) -> str: 291 | """ 292 | Finds all instances of a specified placeholder in a YAML string and appends a given text to these placeholders. 293 | This is the homework to fill out after your yaml is generated. 294 | 295 | Parameters: 296 | - yaml_str (str): The YAML string to process. 297 | 298 | Returns: 299 | - str: The modified YAML string with appended text to placeholders. 300 | """ 301 | updated_yaml = [] 302 | # Split the string into lines to process each line individually 303 | lines = yaml_str.split("\n") 304 | 305 | for line in lines: 306 | # Check if the placeholder is in the current line. 307 | # Strip the last quote to match. 308 | if line.rstrip("'").endswith(_PLACEHOLDER_COMMENT): 309 | # Replace the _PLACEHOLDER_COMMENT with itself plus the append_text 310 | updated_line = line + _FILL_OUT_TOKEN 311 | updated_yaml.append(updated_line) 312 | elif line.rstrip("'").endswith(AUTOGEN_TOKEN): 313 | updated_line = line + _AUTOGEN_COMMENT_TOKEN 314 | updated_yaml.append(updated_line) 315 | # Add comments to specific fields in certain sections. 316 | elif line.lstrip().startswith("join_type"): 317 | updated_line = line + _FILL_OUT_TOKEN + " supported: inner, left_outer" 318 | updated_yaml.append(updated_line) 319 | elif line.lstrip().startswith("relationship_type"): 320 | updated_line = ( 321 | line + _FILL_OUT_TOKEN + " supported: many_to_one, one_to_one" 322 | ) 323 | updated_yaml.append(updated_line) 324 | else: 325 | updated_yaml.append(line) 326 | 327 | # Join the lines back together into a single string 328 | return "\n".join(updated_yaml) 329 | 330 | 331 | def _to_snake_case(s: str) -> str: 332 | """ 333 | Convert a string into snake case. 334 | 335 | Parameters: 336 | s (str): The string to convert. 337 | 338 | Returns: 339 | str: The snake case version of the string. 
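    Example: "My Sales Model" -> "my_sales_model"; "order-items_v2" -> "order_items_v2".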
340 | """ 341 | # Replace common delimiters with spaces 342 | s = s.replace("-", " ").replace("_", " ") 343 | 344 | words = s.split(" ") 345 | 346 | # Convert each word to lowercase and join with underscores 347 | snake_case_str = "_".join([word.lower() for word in words if word]).strip() 348 | 349 | return snake_case_str 350 | 351 | 352 | def generate_base_semantic_model_from_snowflake( 353 | base_tables: List[str], 354 | conn: SnowflakeConnection, 355 | semantic_model_name: str, 356 | n_sample_values: int = _DEFAULT_N_SAMPLE_VALUES_PER_COL, 357 | output_yaml_path: Optional[str] = None, 358 | ) -> None: 359 | """ 360 | Generates a base semantic context from specified Snowflake tables and exports it to a YAML file. 361 | 362 | Parameters: 363 | base_tables : Fully qualified names of Snowflake tables to include in the semantic context. 364 | conn: SnowflakeConnection to reuse. 365 | snowflake_account: Identifier of the Snowflake account. 366 | semantic_model_name: The human readable model name. This should be semantically meaningful to an organization. 367 | output_yaml_path: Path for the output YAML file. If None, defaults to 'semantic_model_generator/output_models/YYYYMMDDHHMMSS_.yaml'. 368 | n_sample_values: The number of sample values to populate for all columns. 369 | 370 | Returns: 371 | None. Writes the semantic context to a YAML file. 372 | """ 373 | formatted_datetime = datetime.now().strftime("%Y%m%d%H%M%S") 374 | if not output_yaml_path: 375 | file_name = f"{formatted_datetime}_{_to_snake_case(semantic_model_name)}.yaml" 376 | if os.path.exists("semantic_model_generator/output_models"): 377 | write_path = f"semantic_model_generator/output_models/{file_name}" 378 | else: 379 | write_path = f"./{file_name}" 380 | else: # Assume user gives correct path. 381 | write_path = output_yaml_path 382 | 383 | yaml_str = generate_model_str_from_snowflake( 384 | base_tables, 385 | n_sample_values=n_sample_values if n_sample_values > 0 else 1, 386 | semantic_model_name=semantic_model_name, 387 | conn=conn, 388 | ) 389 | 390 | with open(write_path, "w") as f: 391 | # Clarify that the YAML was autogenerated and that placeholders should be filled out/deleted. 392 | f.write(_AUTOGEN_COMMENT_WARNING) 393 | f.write(yaml_str) 394 | 395 | logger.info(f"Semantic model saved to {write_path}") 396 | 397 | return None 398 | 399 | 400 | def generate_model_str_from_snowflake( 401 | base_tables: List[str], 402 | semantic_model_name: str, 403 | conn: SnowflakeConnection, 404 | n_sample_values: int = _DEFAULT_N_SAMPLE_VALUES_PER_COL, 405 | allow_joins: Optional[bool] = False, 406 | ) -> str: 407 | """ 408 | Generates a base semantic context from specified Snowflake tables and returns the raw string. 409 | 410 | Parameters: 411 | base_tables : Fully qualified names of Snowflake tables to include in the semantic context. 412 | semantic_model_name: The human readable model name. This should be semantically meaningful to an organization. 413 | conn: SnowflakeConnection to reuse. 414 | n_sample_values: The number of sample values to populate for all columns. 415 | allow_joins: Whether to allow joins in the semantic context. 416 | 417 | Returns: 418 | str: The raw string of the semantic context. 419 | """ 420 | context = raw_schema_to_semantic_context( 421 | base_tables, 422 | n_sample_values=n_sample_values if n_sample_values > 0 else 1, 423 | semantic_model_name=semantic_model_name, 424 | allow_joins=allow_joins, 425 | conn=conn, 426 | ) 427 | # Validate the generated yaml is within context limits. 
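    # (validate_context_length approximates the model size as len(yaml) / 4 tokens and compares it against
    # the prompt token budget defined in semantic_model_generator/validate/context_length.py.)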
428 | # We just throw a warning here to allow users to update. 429 | validate_context_length(context) 430 | 431 | yaml_str = proto_utils.proto_to_yaml(context) 432 | # Once we have the yaml, update to include to # tokens. 433 | yaml_str = append_comment_to_placeholders(yaml_str) 434 | # Comment out the filters section as we don't have a way to auto-generate these yet. 435 | yaml_str = comment_out_section(yaml_str, "filters") 436 | yaml_str = comment_out_section(yaml_str, "relationships") 437 | 438 | return yaml_str 439 | -------------------------------------------------------------------------------- /semantic_model_generator/output_models/.keep: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/semantic_model_generator/output_models/.keep -------------------------------------------------------------------------------- /semantic_model_generator/protos/semantic_model.proto: -------------------------------------------------------------------------------- 1 | // If you make changes to this file, you'll need to run protoc to updated the 2 | // generated files by running the following command: 3 | // 4 | // python -m grpc_tools.protoc -I=semantic_model_generator/protos/ --python_out=semantic_model_generator/protos/ --pyi_out=semantic_model_generator/protos/ semantic_model_generator/protos/semantic_model.proto 5 | 6 | syntax = "proto3"; 7 | 8 | import "google/protobuf/descriptor.proto"; 9 | 10 | package semantic_model_generator; 11 | 12 | // We are using proto FieldOptions to validate the YAMLs match the expected 13 | // schema. Any fields with the `optional` option, are not required during YAML 14 | // validation. 15 | extend google.protobuf.FieldOptions { 16 | optional bool optional = 51234; 17 | optional bool sql_expression = 51235; 18 | optional bool id_field = 51236; 19 | } 20 | 21 | // AggregationType defines a list of various aggregations. 22 | enum AggregationType { 23 | aggregation_type_unknown = 0; 24 | sum = 1; 25 | avg = 2; 26 | median = 7; 27 | min = 3; 28 | max = 4; 29 | count = 5; 30 | count_distinct = 6; 31 | } 32 | 33 | // ColumnKind defines various kinds of columns, mainly categorized into 34 | // dimensions and measures. 35 | enum ColumnKind { 36 | column_kind_unknown = 0; 37 | // A column containing categorical values such as names, countries, dates. 38 | dimension = 1; 39 | // A column containing numerical values such as revenue, impressions, salary. 40 | // TODO: migrate to fact. 41 | measure = 2; 42 | // A column containing date/time data. 43 | time_dimension = 3; 44 | // A "column" containing calculations about an entity such as sum_revenue, 45 | // cvr. 46 | metric = 4; 47 | } 48 | 49 | message RetrievalResult { 50 | string value = 1; 51 | float score = 2; 52 | } 53 | 54 | // Column is analogous to a database column and defines various semantic 55 | // properties of a column. A column can either simply be a column in the base 56 | // database schema or it can be an arbitrary expression over the base schema, 57 | // e.g. `base_column1 + base_column2`. 58 | message Column { 59 | // A descriptive name for this column. 60 | string name = 1 [ (id_field) = true ]; 61 | // A list of other terms/phrases used to refer to this column. 62 | repeated string synonyms = 2 [ (optional) = true ]; 63 | // A brief description about this column, including things like what data this 64 | // column has. 
65 | string description = 3 [ (optional) = true ]; 66 | // The SQL expression for this column. Could simply be a base table column 67 | // name or an arbitrary SQL expression over one or more columns of the base 68 | // table. 69 | string expr = 4 [ (sql_expression) = true ]; 70 | // The data type of this column. 71 | // TODO(nsehrawat): Consider creating an enum instead, with all snowflake 72 | // support data types. 73 | string data_type = 5; 74 | // The kind of this column - dimension or fact, metric. 75 | ColumnKind kind = 6; 76 | // If true, assume that this column has unique values. 77 | bool unique = 7 [ (optional) = true ]; 78 | // If no aggregation is specified, then this is the default aggregation 79 | // applied to this column in contxt of a grouping. 80 | AggregationType default_aggregation = 8 [ (optional) = true, deprecated = true ]; 81 | // Sample values of this column. 82 | repeated string sample_values = 9 [ (optional) = true ]; 83 | // Whether to index the values and retrieve them based on the question. 84 | // If False, all sample values will be used as input to the model. 85 | bool index_and_retrieve_values = 10 [ (optional) = true ]; 86 | // Retrieved literals of this column. 87 | repeated RetrievalResult retrieved_literals = 11 [ (optional) = true ]; 88 | 89 | // A Cortex Search Service configured on this column to retrieve literals. 90 | string cortex_search_service_name = 12 91 | [ (optional) = true, deprecated = true ]; 92 | CortexSearchService cortex_search_service = 13 [ (optional) = true ]; 93 | // If true, this column has limited possible values, all of which are in 94 | // the sample_values field. 95 | bool is_enum = 14 [ (optional) = true ]; 96 | } 97 | 98 | // Dimension columns contain categorical values (e.g. state, user_type, 99 | // platform). NOTE: If modifying this protobuf, make appropriate changes in 100 | // context_to_column_format() of snowpilot/semantic_context/protos/schema.py. 101 | message Dimension { 102 | // A descriptive name for this dimension. 103 | string name = 1 [ (id_field) = true ]; 104 | // A list of other terms/phrases used to refer to this dimension. 105 | repeated string synonyms = 2 [ (optional) = true ]; 106 | // A brief description about this dimension, including things like 107 | // what data this dimension has. 108 | string description = 3 [ (optional) = true ]; 109 | // The SQL expression defining this dimension. Could simply be a physical 110 | // column name or an arbitrary SQL expression over one or more columns of the 111 | // physical table. 112 | string expr = 4 [ (sql_expression) = true ]; 113 | // The data type of this dimension. 114 | // TODO(nsehrawat): Consider creating an enum instead with all snowflake 115 | // support data types. 116 | string data_type = 5; 117 | // If true, assume that this dimension has unique values. 118 | bool unique = 6 [ (optional) = true ]; 119 | // Sample values of this column. 120 | repeated string sample_values = 7 [ (optional) = true ]; 121 | // A Cortex Search Service configured on this column to retrieve literals. 122 | CortexSearchService cortex_search_service = 8 [ (optional) = true ]; 123 | string cortex_search_service_name = 9 124 | [ (optional) = true, deprecated = true ]; 125 | // If true, this column has limited possible values, all of which are in 126 | // the sample_values field. 127 | bool is_enum = 10 [ (optional) = true ]; 128 | } 129 | 130 | // Fully qualified Cortex Search Service name. 
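// Illustrative YAML usage on a dimension (database/schema/service names are hypothetical):
//   cortex_search_service:
//     database: MY_DB
//     schema: MY_SCHEMA
//     service: PRODUCT_LINE_SEARCH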
131 | message CortexSearchService { 132 | string database = 1 [ (optional) = true ]; 133 | string schema = 2 [ (optional) = true ]; 134 | string service = 3; 135 | string literal_column = 4 [ (optional) = true ]; 136 | } 137 | 138 | // Time dimension columns contain time values (e.g. sale_date, created_at, 139 | // year). NOTE: If modifying this protobuf, make appropriate changes in 140 | // to_column_format() of snowpilot/semantic_context/utils/utils.py. 141 | message TimeDimension { 142 | // A descriptive name for this time dimension. 143 | string name = 1 [ (id_field) = true ]; 144 | // A list of other terms/phrases used to refer to this time dimension. 145 | repeated string synonyms = 2 [ (optional) = true ]; 146 | // A brief description about this time dimension, including things like 147 | // what data it has, the timezone of values, etc. 148 | string description = 3 [ (optional) = true ]; 149 | // The SQL expression defining this time dimension. Could simply be a physical 150 | // column name or an arbitrary SQL expression over one or more columns of the 151 | // physical table. 152 | string expr = 4 [ (sql_expression) = true ]; 153 | // The data type of this time dimension. 154 | // TODO(nsehrawat): Consider creating an enum instead, with all snowflake 155 | // support data types. 156 | string data_type = 5; 157 | // If true, assume that this time dimension has unique values. 158 | bool unique = 6 [ (optional) = true ]; 159 | // Sample values of this time dimension. 160 | repeated string sample_values = 7 [ (optional) = true ]; 161 | } 162 | 163 | // Measure columns contain numerical values (e.g. revenue, impressions, salary). 164 | // NOTE: If modifying this protobuf, make appropriate changes in 165 | // to_column_format() of snowpilot/semantic_context/utils/utils.py. 166 | message Fact { 167 | // A descriptive name for this measure. 168 | string name = 1 [ (id_field) = true ]; 169 | // A list of other terms/phrases used to refer to this measure. 170 | repeated string synonyms = 2 [ (optional) = true ]; 171 | // A brief description about this measure, including things like what data 172 | // it has. 173 | string description = 3 [ (optional) = true ]; 174 | // The SQL expression defining this measure. Could simply be a physical column 175 | // name or an arbitrary SQL expression over one or more physical columns of 176 | // the underlying physical table. 177 | string expr = 4 [ (sql_expression) = true ]; 178 | // The data type of this measure. 179 | // TODO(nsehrawat): Consider creating an enum instead, with all snowflake 180 | // support data types. 181 | string data_type = 5; 182 | // If no aggregation is specified, then this is the default aggregation 183 | // applied to this measure in contxt of a grouping. 184 | AggregationType default_aggregation = 6 [ (optional) = true , deprecated = true ]; 185 | // Sample values of this measure. 186 | repeated string sample_values = 7 [ (optional) = true ]; 187 | } 188 | 189 | // Filter represents a named SQL expression that's used for filtering. 190 | // TODO: add validation. we should only support where clause style filter (no 191 | // aggregations) and reject having clauses. 192 | message NamedFilter { 193 | // A descriptive name for this filter. 194 | string name = 1; 195 | // A list of other term/phrases used to refer to this column. 196 | repeated string synonyms = 2 [ (optional) = true ]; 197 | // A brief description about this column, including details of what this 198 | // filter is typically used for. 
199 | string description = 3 [ (optional) = true ]; 200 | // The SQL expression of this filter. 201 | string expr = 4 [ (sql_expression) = true ]; 202 | } 203 | 204 | // FullyQualifiedTable is used to represent three part table names - 205 | // (database, schema, table). 206 | message FullyQualifiedTable { 207 | string database = 1; 208 | string schema = 2; 209 | string table = 3; 210 | } 211 | 212 | // Defines a primary key of a table. In the general case, primary keys 213 | // are a collection of columns of the table. 214 | // For discussion: PK FK are potentially duplicative to join path in a semantic 215 | // model. However, it implies uniqueness which can be informative for getting 216 | // right aggregation level. For that reason, we are exposing only the PrimaryKey 217 | // currently. Join paths seem more extensible than foreign keys for supporting 218 | // join. Further experimentation is needed to see if JoinPath and ForeignKey can 219 | // yield similar results. 220 | message PrimaryKey { 221 | // Base column names that constitute the primary key. 222 | repeated string columns = 1; 223 | } 224 | 225 | // Defines a foreign key that references the primary key of another table. 226 | message ForeignKey { 227 | // Base column names of the foreign key table. 228 | repeated string fkey_columns = 1; 229 | // The primary key table that this foreign key references. 230 | FullyQualifiedTable pkey_table = 2; 231 | // Base column names of the primary key table. 232 | repeated string pkey_columns = 3; 233 | } 234 | 235 | // Table is analogous to a database table and provides a simple view over an 236 | // existing database table. A table can leave out some columns from the base 237 | // table and/or introduce new derived columns. 238 | message Table { 239 | // A descriptive name for this table. 240 | string name = 1 [ (id_field) = true ]; 241 | // A list of other term/phrases used to refer to this table. 242 | repeated string synonyms = 2 [ (optional) = true ]; 243 | // A brief description of this table, including details of what kinds of 244 | // analysis is it typically used for. 245 | string description = 3 [ (optional) = true ]; 246 | // Fully qualified name of the underlying base table. 247 | FullyQualifiedTable base_table = 4; 248 | 249 | // We allow two formats for specifying logical columns of a table: 250 | // 1. As a list of columns. 251 | // 2. As three separate list of dimensions, time dimensions, and measures. 252 | // For the external facing yaml specification, we have chosen to go with (2). 253 | // However, for the time being we'll support both (1) and (2) and continue 254 | // using (1) as the internal representation. 255 | repeated Column columns = 5 [ (optional) = true ]; 256 | repeated Dimension dimensions = 9 [ (optional) = true ]; 257 | repeated TimeDimension time_dimensions = 10 [ (optional) = true ]; 258 | repeated Fact measures = 11 [ (optional) = true, deprecated = true ]; 259 | repeated Fact facts = 12 [ (optional) = true ]; 260 | repeated Metric metrics = 13 [ (optional) = true ]; 261 | 262 | // Primary key of the table, if any. 263 | PrimaryKey primary_key = 6 [ (optional) = true ]; 264 | // Foreign keys of the table, if any. 265 | repeated ForeignKey foreign_keys = 7 [ (optional) = true ]; 266 | // Predefined filters on this table, if any. 267 | repeated NamedFilter filters = 8 [ (optional) = true ]; 268 | // NEXT_TAG: 14. 269 | } 270 | 271 | // Metric are named computation over a collection of columns. 
For now, we 272 | // only allow a metric to be defined over columns from a single table. In 273 | // future, we'll expand to allowing metrics that refer to columns from multiple 274 | // tables. 275 | message Metric { 276 | // A descriptive name of the metric. 277 | string name = 1 [ (id_field) = true ]; 278 | // A list of other term/phrases used to refer to this metric. 279 | repeated string synonyms = 2 [ (optional) = true ]; 280 | // A brief description of this metric, including details of what it computes. 281 | string description = 3 [ (optional) = true ]; 282 | // The SQL expression to compute this metric. 283 | // All columns used must be fully qualified with the logical table name. 284 | // Expression must be an aggregate 285 | string expr = 4 [ (sql_expression) = true ]; 286 | // The filter associated with this metric. 287 | // Do not expose this for now. 288 | MetricsFilter filter = 5 [ (optional) = true ]; 289 | } 290 | 291 | message MetricsFilter { string expr = 1 [ (sql_expression) = true ]; } 292 | 293 | // Type of the join - inner, left outer, etc. 294 | enum JoinType { 295 | join_type_unknown = 0; 296 | inner = 1; 297 | left_outer = 2; 298 | full_outer = 3 [ deprecated = true ]; 299 | cross = 4 [ deprecated = true ]; 300 | right_outer = 5 [ deprecated = true ]; 301 | } 302 | 303 | // Type of the relationship - one-to-one, many-to-one, etc. 304 | enum RelationshipType { 305 | relationship_type_unknown = 0; 306 | one_to_one = 1; 307 | many_to_one = 2; 308 | one_to_many = 3 [ deprecated = true ]; 309 | many_to_many = 4 [ deprecated = true ]; 310 | } 311 | 312 | message RelationKey { 313 | // Only support equi-join relationship for now. 314 | string left_column = 1; 315 | string right_column = 2; 316 | } 317 | 318 | // Relationship represents a join between two tables. 319 | message Relationship { 320 | // A unique name of the join. 321 | string name = 1; 322 | // The left hand side table of the join. 323 | string left_table = 2; 324 | // The right hand side table of the join. 325 | string right_table = 3; 326 | // The expression used to join left and right tables. Only used internally. 327 | string expr = 4 [ (sql_expression) = true, (optional) = true ]; 328 | // Keys directly represent the join relationship. 329 | repeated RelationKey relationship_columns = 7 [ (optional) = true ]; 330 | // Type of the join. 331 | JoinType join_type = 5; 332 | // Type of the relationship. 333 | RelationshipType relationship_type = 6; 334 | } 335 | 336 | // A message that encapsulates custom instructions for each module. 337 | message ModuleCustomInstructions { 338 | // Custom instructions for SQL Generation. 339 | string sql_generation = 1 [ (optional) = true ]; 340 | // Custom instructions for Question Categorization. 341 | string question_categorization = 2 [ (optional) = true ]; 342 | } 343 | 344 | // The semantic context relevant to generating SQL for answering a data 345 | // question. 346 | message SemanticModel { 347 | // A descriptive name of the project. 348 | string name = 1; 349 | // A brief description of this project, including details of what kind of 350 | // analysis does this project enable. 351 | string description = 2 [ (optional) = true ]; 352 | // List of tables in this project. 353 | repeated Table tables = 3; 354 | // List of relationships in this project. 355 | repeated Relationship relationships = 5 [ (optional) = true ]; 356 | // List of verified queries for this semantic model. 
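// Illustrative YAML entry (question and SQL are hypothetical):
//   verified_queries:
//     - name: count all orders
//       question: How many orders are there in total?
//       sql: SELECT COUNT(*) FROM orders
//       verified_by: analyst_name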
357 | repeated VerifiedQuery verified_queries = 6 [ (optional) = true ]; 358 | // Custom instructions that will be applied to the final SQL generation. 359 | string custom_instructions = 7 [ (optional) = true ]; 360 | // Module-specific custom instructions. The SQL generation instruction here 361 | // will take precedence over the legacy custom_instructions if it exists. 362 | ModuleCustomInstructions module_custom_instructions = 8 [ (optional) = true ]; 363 | } 364 | 365 | // VerifiedQuery represents a (question, sql) pair that has been manually 366 | // verified (e.g. by an analyst) to be correct. 367 | message VerifiedQuery { 368 | // A name for this verified query. Mainly used for display purposes. 369 | string name = 1; 370 | // The name of the semantic model on which this verified query is based off. 371 | string semantic_model_name = 2 [ (optional) = true ]; 372 | // The question being answered. 373 | string question = 3; 374 | // The correct SQL query for answering the question. 375 | string sql = 4 [ (sql_expression) = true ]; 376 | // Timestamp at which the query was last verified - measures in seconds since 377 | // epoch, in UTC. 378 | int64 verified_at = 5 [ (optional) = true ]; 379 | // Name of the person who verified this query. 380 | string verified_by = 6 [ (optional) = true ]; 381 | // Whether to always include in this question in the suggested questions 382 | // module 383 | bool use_as_onboarding_question = 7 [ (optional) = true ]; 384 | } 385 | 386 | // VerifiedQueryRepository is a simply a collection of verified queries. 387 | message VerifiedQueryRepository { repeated VerifiedQuery verified_queries = 1; } 388 | -------------------------------------------------------------------------------- /semantic_model_generator/snowflake_utils/env_vars.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from dotenv import load_dotenv 4 | 5 | load_dotenv(override=True) 6 | DEFAULT_SESSION_TIMEOUT_SEC = int(os.environ.get("SNOWFLAKE_SESSION_TIMEOUT_SEC", 120)) 7 | SNOWFLAKE_ROLE = os.getenv("SNOWFLAKE_ROLE") 8 | SNOWFLAKE_WAREHOUSE = os.getenv("SNOWFLAKE_WAREHOUSE") 9 | SNOWFLAKE_USER = os.getenv("SNOWFLAKE_USER") 10 | SNOWFLAKE_PASSWORD = os.getenv("SNOWFLAKE_PASSWORD") 11 | SNOWFLAKE_HOST = os.getenv("SNOWFLAKE_HOST") 12 | SNOWFLAKE_AUTHENTICATOR = os.getenv("SNOWFLAKE_AUTHENTICATOR") 13 | SNOWFLAKE_ACCOUNT_LOCATOR = os.getenv("SNOWFLAKE_ACCOUNT_LOCATOR") 14 | 15 | # Optional MFA environment variables 16 | SNOWFLAKE_MFA_PASSCODE = os.getenv("SNOWFLAKE_MFA_PASSCODE") 17 | SNOWFLAKE_MFA_PASSCODE_IN_PASSWORD = os.getenv("SNOWFLAKE_MFA_PASSCODE_IN_PASSWORD") 18 | 19 | 20 | def assert_required_env_vars() -> list[str]: 21 | """ 22 | Ensures that the required environment variables are set before proceeding. 
23 | Returns: list of missing required environment variables 24 | 25 | """ 26 | 27 | missing_env_vars = [] 28 | if not SNOWFLAKE_ROLE: 29 | missing_env_vars.append("SNOWFLAKE_ROLE") 30 | if not SNOWFLAKE_WAREHOUSE: 31 | missing_env_vars.append("SNOWFLAKE_WAREHOUSE") 32 | if not SNOWFLAKE_USER: 33 | missing_env_vars.append("SNOWFLAKE_USER") 34 | if not SNOWFLAKE_ACCOUNT_LOCATOR: 35 | missing_env_vars.append("SNOWFLAKE_ACCOUNT_LOCATOR") 36 | if not SNOWFLAKE_HOST: 37 | missing_env_vars.append("SNOWFLAKE_HOST") 38 | if not SNOWFLAKE_PASSWORD and not SNOWFLAKE_AUTHENTICATOR: 39 | missing_env_vars.append("SNOWFLAKE_PASSWORD/SNOWFLAKE_AUTHENTICATOR") 40 | 41 | # Assert that SNOWFLAKE_PASSWORD is required unless the user is using the externalbrowser authenticator 42 | if ( 43 | SNOWFLAKE_AUTHENTICATOR 44 | and SNOWFLAKE_AUTHENTICATOR.lower() != "externalbrowser" 45 | and not SNOWFLAKE_PASSWORD 46 | ): 47 | missing_env_vars.append("SNOWFLAKE_PASSWORD") 48 | 49 | return missing_env_vars 50 | -------------------------------------------------------------------------------- /semantic_model_generator/snowflake_utils/utils.py: -------------------------------------------------------------------------------- 1 | from typing import Dict, Optional, Union 2 | 3 | from snowflake.connector import connect 4 | from snowflake.connector.connection import SnowflakeConnection 5 | 6 | from semantic_model_generator.data_processing.data_types import FQNParts 7 | 8 | 9 | def create_fqn_table(fqn_str: str) -> FQNParts: 10 | if fqn_str.count(".") != 2: 11 | raise ValueError( 12 | "Expected to have a table fully qualified name following the {database}.{schema}.{table} format." 13 | + f"Instead found {fqn_str}" 14 | ) 15 | database, schema, table = fqn_str.split(".") 16 | return FQNParts( 17 | database=database.upper(), schema_name=schema.upper(), table=table.upper() 18 | ) 19 | 20 | 21 | def create_connection_parameters( 22 | user: str, 23 | account: str, 24 | password: Optional[str] = None, 25 | host: Optional[str] = None, 26 | role: Optional[str] = None, 27 | warehouse: Optional[str] = None, 28 | database: Optional[str] = None, 29 | schema: Optional[str] = None, 30 | authenticator: Optional[str] = None, 31 | passcode: Optional[str] = None, 32 | passcode_in_password: Optional[bool] = None, 33 | ) -> Dict[str, Union[str, bool]]: 34 | connection_parameters: Dict[str, Union[str, bool]] = dict( 35 | user=user, account=account 36 | ) 37 | if password: 38 | connection_parameters["password"] = password 39 | if role: 40 | connection_parameters["role"] = role 41 | if warehouse: 42 | connection_parameters["warehouse"] = warehouse 43 | if database: 44 | connection_parameters["database"] = database 45 | if schema: 46 | connection_parameters["schema"] = schema 47 | if authenticator: 48 | connection_parameters["authenticator"] = authenticator 49 | if host: 50 | connection_parameters["host"] = host 51 | if passcode: 52 | connection_parameters["passcode"] = passcode 53 | if passcode_in_password: 54 | connection_parameters["passcode_in_password"] = passcode_in_password 55 | return connection_parameters 56 | 57 | 58 | def _connection( 59 | connection_parameters: Dict[str, Union[str, bool]] 60 | ) -> SnowflakeConnection: 61 | # https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-connect 62 | return connect(**connection_parameters) 63 | 64 | 65 | def snowflake_connection( 66 | user: str, 67 | account: str, 68 | role: str, 69 | warehouse: str, 70 | password: Optional[str] = None, 71 | host: Optional[str] = None, 72 
| authenticator: Optional[str] = None, 73 | passcode: Optional[str] = None, 74 | passcode_in_password: Optional[bool] = None, 75 | ) -> SnowflakeConnection: 76 | """ 77 | Returns a Snowflake Connection to the specified account. 78 | """ 79 | return _connection( 80 | create_connection_parameters( 81 | user=user, 82 | password=password, 83 | host=host, 84 | account=account, 85 | role=role, 86 | warehouse=warehouse, 87 | authenticator=authenticator, 88 | passcode=passcode, 89 | passcode_in_password=passcode_in_password, 90 | ) 91 | ) 92 | -------------------------------------------------------------------------------- /semantic_model_generator/tests/snowflake_connector_test.py: -------------------------------------------------------------------------------- 1 | from unittest import mock 2 | from unittest.mock import MagicMock, call, patch 3 | 4 | import pandas as pd 5 | import pytest 6 | from pandas.testing import assert_frame_equal 7 | 8 | from semantic_model_generator.data_processing.data_types import Column, Table 9 | from semantic_model_generator.snowflake_utils import snowflake_connector 10 | 11 | 12 | @pytest.fixture 13 | def mock_snowflake_connection_env(monkeypatch): 14 | # Mock environment variable 15 | monkeypatch.setenv("SNOWFLAKE_HOST", "test_host") 16 | 17 | # Use this fixture to also patch instance methods if needed 18 | with patch.object( 19 | snowflake_connector.SnowflakeConnector, "_get_user", return_value="test_user" 20 | ), patch.object( 21 | snowflake_connector.SnowflakeConnector, 22 | "_get_password", 23 | return_value="test_password", 24 | ), patch.object( 25 | snowflake_connector.SnowflakeConnector, "_get_role", return_value="test_role" 26 | ), patch.object( 27 | snowflake_connector.SnowflakeConnector, 28 | "_get_warehouse", 29 | return_value="test_warehouse", 30 | ), patch.object( 31 | snowflake_connector.SnowflakeConnector, "_get_host", return_value="test_host" 32 | ): 33 | yield 34 | 35 | 36 | @pytest.fixture 37 | def schemas_tables_columns() -> pd.DataFrame: 38 | return pd.DataFrame( 39 | columns=[ 40 | "TABLE_SCHEMA", 41 | "TABLE_NAME", 42 | "COLUMN_NAME", 43 | "DATA_TYPE", 44 | "COLUMN_COMMENT", 45 | ], 46 | data=[ 47 | ["TEST_SCHEMA_1", "table_1", "col_1", "VARCHAR", None], 48 | ["TEST_SCHEMA_1", "table_1", "col_2", "NUMBER", None], 49 | ["TEST_SCHEMA_1", "table_2", "col_1", "NUMBER", "table_2_col_1_comment"], 50 | [ 51 | "TEST_SCHEMA_1", 52 | "table_2", 53 | "col_2", 54 | "TIMESTAMP_NTZ", 55 | "table_2_col_2_comment", 56 | ], 57 | ["TEST_SCHEMA_2", "table_3", "col_1", "VARIANT", None], 58 | [ 59 | "TEST_SCHEMA_2", 60 | "invalid_table", 61 | "col_1", 62 | "VARIANT", 63 | "invalid_table_col_1_comment", 64 | ], 65 | ], 66 | ) 67 | 68 | 69 | @pytest.fixture 70 | def valid_tables() -> pd.DataFrame: 71 | return pd.DataFrame( 72 | columns=["TABLE_SCHEMA", "TABLE_NAME", "TABLE_COMMENT"], 73 | data=[ 74 | ["TEST_SCHEMA_1", "table_1", None], 75 | ["TEST_SCHEMA_1", "table_2", "table_2_comment"], 76 | ["TEST_SCHEMA_2", "table_3", "table_3_comment"], 77 | ], 78 | ) 79 | 80 | 81 | _TEST_TABLE_ONE = Table( 82 | id_=0, 83 | name="table_1", 84 | columns=[ 85 | Column( 86 | id_=0, 87 | column_name="col_1", 88 | column_type="text", 89 | is_primary_key=True, 90 | is_foreign_key=False, 91 | ), 92 | Column( 93 | id_=1, 94 | column_name="col_2", 95 | column_type="number", 96 | is_primary_key=False, 97 | is_foreign_key=False, 98 | ), 99 | ], 100 | ) 101 | 102 | 103 | @mock.patch( 104 | "semantic_model_generator.snowflake_utils.snowflake_connector.snowflake_connection" 105 | ) 106 | def 
test_connect( 107 | mock_snowflake_connection: mock.MagicMock, mock_snowflake_connection_env 108 | ): 109 | mock_snowflake_connection.return_value = mock.MagicMock() 110 | 111 | connector = snowflake_connector.SnowflakeConnector(account_name="test_account") 112 | with connector.connect(db_name="test") as conn: 113 | pass 114 | 115 | conn.cursor().execute.assert_has_calls( 116 | [ 117 | call("ALTER SESSION SET QUERY_TAG = 'SEMANTIC_MODEL_GENERATOR'"), 118 | call("ALTER SESSION SET STATEMENT_TIMEOUT_IN_SECONDS = 120"), 119 | ] 120 | ) 121 | conn.close.assert_called_with() 122 | 123 | 124 | @mock.patch( 125 | "semantic_model_generator.snowflake_utils.snowflake_connector.snowflake_connection" 126 | ) 127 | def test_connect_with_schema( 128 | mock_snowflake_connection: mock.MagicMock, mock_snowflake_connection_env 129 | ): 130 | mock_snowflake_connection.return_value = mock.MagicMock() 131 | 132 | connector = snowflake_connector.SnowflakeConnector( 133 | account_name="test_account", 134 | ) 135 | with connector.connect(db_name="test_db", schema_name="test_schema") as conn: 136 | pass 137 | 138 | conn.cursor().execute.assert_has_calls( 139 | [ 140 | call("ALTER SESSION SET QUERY_TAG = 'SEMANTIC_MODEL_GENERATOR'"), 141 | call("ALTER SESSION SET STATEMENT_TIMEOUT_IN_SECONDS = 120"), 142 | ] 143 | ) 144 | conn.close.assert_called_with() 145 | 146 | 147 | @mock.patch( 148 | "semantic_model_generator.snowflake_utils.snowflake_connector._fetch_valid_tables_and_views" 149 | ) 150 | @mock.patch( 151 | "semantic_model_generator.snowflake_utils.snowflake_connector.snowflake_connection" 152 | ) 153 | def test_get_valid_schema_table_columns_df( 154 | mock_snowflake_connection: mock.MagicMock, 155 | mock_valid_tables: mock.MagicMock, 156 | valid_tables: pd.DataFrame, 157 | schemas_tables_columns: pd.DataFrame, 158 | ): 159 | mock_conn = mock.MagicMock() 160 | # We expect get_database_representation() to execute queries in this order: 161 | # - select from information_schema.tables 162 | # - select from information_schema.columns for each table. 163 | mock_conn.cursor().execute().fetch_pandas_all.side_effect = [ 164 | schemas_tables_columns[schemas_tables_columns["TABLE_NAME"] == "table_1"] 165 | ] 166 | mock_snowflake_connection.return_value = mock_conn 167 | mock_valid_tables.return_value = valid_tables 168 | 169 | got = snowflake_connector.get_valid_schemas_tables_columns_df( 170 | mock_conn, "TEST_DB", "TEST_SCHEMA_1", ["table_1"] 171 | ) 172 | 173 | want_data = { 174 | "TABLE_SCHEMA": ["TEST_SCHEMA_1", "TEST_SCHEMA_1"], 175 | "TABLE_NAME": ["table_1", "table_1"], 176 | "TABLE_COMMENT": [None, None], 177 | "COLUMN_NAME": ["col_1", "col_2"], 178 | "DATA_TYPE": ["VARCHAR", "NUMBER"], 179 | "COLUMN_COMMENT": [None, None], 180 | } 181 | 182 | # Create a DataFrame 183 | want = pd.DataFrame(want_data) 184 | 185 | assert_frame_equal(want, got) 186 | 187 | # Assert that the connection executed the expected queries. 
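    # Note: assert_any_call compares the SQL text exactly (including whitespace and newlines), so any
    # formatting change to the query built in snowflake_connector requires updating this literal as well.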
188 | query = "select t.TABLE_SCHEMA, t.TABLE_NAME, c.COLUMN_NAME, c.DATA_TYPE, c.COMMENT as COLUMN_COMMENT\nfrom TEST_DB.information_schema.tables as t\njoin TEST_DB.information_schema.columns as c on t.table_schema = c.table_schema and t.table_name = c.table_name where t.table_schema ilike 'TEST_SCHEMA_1' AND LOWER(t.table_name) in ('table_1') \norder by 1, 2, c.ordinal_position" 189 | mock_conn.cursor().execute.assert_any_call(query) 190 | 191 | 192 | @pytest.fixture 193 | def snowflake_data(): 194 | return [ 195 | # This mimics the return value of cursor.fetchall() for tables and views 196 | ([("table1", "schema1", "A table comment")], [("column1", "dtype")]), 197 | ([("view1", "schema1", "A view comment")], [("column1", "dtype")]), 198 | ] 199 | 200 | 201 | @pytest.fixture 202 | def expected_df(): 203 | # Expected DataFrame structure based on mocked fetchall data 204 | return pd.DataFrame( 205 | { 206 | snowflake_connector._TABLE_NAME_COL: ["table1", "view1"], 207 | snowflake_connector._TABLE_SCHEMA_COL: ["schema1", "schema1"], 208 | snowflake_connector._TABLE_COMMENT_COL: [ 209 | "A table comment", 210 | "A view comment", 211 | ], 212 | } 213 | ) 214 | 215 | 216 | def test_fetch_valid_tables_and_views(snowflake_data, expected_df): 217 | # Mock SnowflakeConnection and cursor 218 | mock_conn = mock.MagicMock() 219 | mock_cursor = mock_conn.cursor.return_value 220 | mock_cursor.execute.return_value = mock_cursor 221 | # Set side effects for fetchall and description based on snowflake_data fixture 222 | mock_cursor.fetchall.side_effect = [snowflake_data[0][0], snowflake_data[1][0]] 223 | 224 | mock_name_one = MagicMock() 225 | mock_name_one.name = "name" 226 | mock_name_two = MagicMock() 227 | mock_name_two.name = "schema_name" 228 | mock_name_three = MagicMock() 229 | mock_name_three.name = "comment" 230 | 231 | mocked_descriptions = [mock_name_one, mock_name_two, mock_name_three] 232 | mock_cursor.description = mocked_descriptions 233 | 234 | # Call the function to test 235 | result_df = snowflake_connector._fetch_valid_tables_and_views(mock_conn, "mock_db") 236 | 237 | # Assert the result is as expected 238 | pd.testing.assert_frame_equal( 239 | result_df.reset_index(drop=True), expected_df.reset_index(drop=True) 240 | ) 241 | 242 | # Verify execute was called with correct queries 243 | mock_cursor.execute.assert_any_call("show tables in database mock_db") 244 | mock_cursor.execute.assert_any_call("show views in database mock_db") 245 | -------------------------------------------------------------------------------- /semantic_model_generator/tests/utils_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | 3 | from semantic_model_generator.data_processing.data_types import FQNParts 4 | from semantic_model_generator.snowflake_utils.utils import create_fqn_table 5 | 6 | 7 | def test_fqn_creation(): 8 | input_name = "database.schema.table" 9 | 10 | fqn_parts = create_fqn_table(input_name) 11 | 12 | assert fqn_parts == FQNParts( 13 | database="DATABASE", schema_name="SCHEMA", table="table" 14 | ) 15 | 16 | 17 | def test_fqn_creation_invalid_name(): 18 | input_name = "database.schema table" 19 | with pytest.raises(ValueError): 20 | create_fqn_table(input_name) 21 | -------------------------------------------------------------------------------- /semantic_model_generator/tests/validate_model_test.py: -------------------------------------------------------------------------------- 1 | import json 2 | from unittest.mock import 
MagicMock, patch 3 | 4 | from snowflake.connector import SnowflakeConnection 5 | 6 | from semantic_model_generator.validate_model import validate 7 | 8 | 9 | @patch("semantic_model_generator.validate_model.send_message") 10 | def test_validate_success(mock_send_message): 11 | # Mock the response from send_message to simulate a successful response 12 | mock_send_message.return_value = {} 13 | 14 | # Call the validate function 15 | conn = MagicMock(spec=SnowflakeConnection) 16 | yaml_str = "valid_yaml_content" 17 | result = validate(yaml_str, conn) 18 | 19 | assert result is None 20 | 21 | 22 | @patch("semantic_model_generator.validate_model.send_message") 23 | def test_validate_error(mock_send_message): 24 | # Mock the response from send_message to simulate an error response 25 | mock_send_message.return_value = { 26 | "error": json.dumps( 27 | { 28 | "message": "This YAML is missing a name. Please use https://github.com/Snowflake-Labs/semantic-model-generator.*" 29 | } 30 | ) 31 | } 32 | 33 | # Call the validate function and assert that it raises a ValueError 34 | conn = MagicMock(spec=SnowflakeConnection) 35 | yaml_str = "invalid_yaml_content" 36 | try: 37 | validate(yaml_str, conn) 38 | except ValueError as e: 39 | # Verify that the error message is as expected 40 | assert str(e) == "This YAML is missing a name." 41 | -------------------------------------------------------------------------------- /semantic_model_generator/tests/yaml_to_semantic_model_test.py: -------------------------------------------------------------------------------- 1 | import pytest 2 | from strictyaml import YAMLValidationError 3 | 4 | from semantic_model_generator.data_processing.proto_utils import yaml_to_semantic_model 5 | 6 | 7 | def test_valid_yaml(): 8 | yaml = """ 9 | name: jaffle_shop 10 | tables: 11 | - name: orders 12 | description: Order overview data mart, offering key details for each order including 13 | if it's a customer's first order and a food vs. drink item breakdown. One row 14 | per order. 15 | base_table: 16 | database: autosql_dataset_dbt_jaffle_shop 17 | schema: data 18 | table: orders 19 | filters: 20 | - name: large_order 21 | expr: cogs > 100 22 | - name: custom_filter 23 | expr: my_udf(col1, col2) 24 | - name: window_func 25 | expr: COUNT(i) OVER (PARTITION BY p ORDER BY o) count_i_Range_Pre 26 | """ 27 | assert yaml_to_semantic_model(yaml) is not None 28 | 29 | 30 | def test_invalid_sql(): 31 | yaml = """ 32 | name: jaffle_shop 33 | tables: 34 | - name: orders 35 | description: Order overview data mart, offering key details for each order including 36 | if it's a customer's first order and a food vs. drink item breakdown. One row 37 | per order. 38 | base_table: 39 | database: autosql_dataset_dbt_jaffle_shop 40 | schema: data 41 | table: orders 42 | filters: 43 | - name: large_order 44 | expr: (cogs > 100 45 | """ 46 | with pytest.raises(YAMLValidationError, match=r".*invalid SQL expression.*"): 47 | yaml_to_semantic_model(yaml) 48 | 49 | 50 | def test_required_field_missing(): 51 | yaml = """ 52 | name: jaffle_shop 53 | tables: 54 | - name: orders 55 | description: Order overview data mart, offering key details for each order including 56 | if it's a customer's first order and a food vs. drink item breakdown. One row 57 | per order. 
58 | base_table: 59 | database: autosql_dataset_dbt_jaffle_shop 60 | schema: data 61 | """ 62 | with pytest.raises( 63 | YAMLValidationError, match=r".*required key.*table.*not found.*" 64 | ): 65 | yaml_to_semantic_model(yaml) 66 | 67 | 68 | def test_non_string_sample_value(): 69 | yaml = """ 70 | name: jaffle_shop 71 | tables: 72 | - name: orders 73 | description: Order overview data mart, offering key details for each order including 74 | if it's a customer's first order and a food vs. drink item breakdown. One row 75 | per order. 76 | base_table: 77 | database: autosql_dataset_dbt_jaffle_shop 78 | schema: data 79 | table: orders 80 | columns: 81 | - name: order_id 82 | expr: order_id 83 | data_type: TEXT 84 | kind: dimension 85 | unique: true 86 | sample_values: 87 | - yes 88 | - 1 89 | - 05-17-2024 90 | """ 91 | ctx = yaml_to_semantic_model(yaml) 92 | for sample_value in ctx.tables[0].columns[0].sample_values: 93 | assert isinstance(sample_value, str) 94 | -------------------------------------------------------------------------------- /semantic_model_generator/validate/context_length.py: -------------------------------------------------------------------------------- 1 | import copy 2 | from typing import Any, TypeVar 3 | 4 | from google.protobuf.message import Message 5 | from loguru import logger 6 | 7 | from semantic_model_generator.data_processing.proto_utils import proto_to_yaml 8 | from semantic_model_generator.protos import semantic_model_pb2 9 | 10 | # Max number of sample values we include in the semantic model representation. 11 | _MAX_SAMPLE_VALUES = 3 12 | 13 | ProtoMsg = TypeVar("ProtoMsg", bound=Message) 14 | 15 | # Max total tokens is 32800. 16 | # We reserve 500 tokens for response (average response is 300 tokens). 17 | # So the prompt token limit is 32300. 18 | # We reserve 1220 tokens for model instructions, separate from the semantic model. 19 | # Thus, the semantic model will get about 31080 tokens, 20 | # with some more discounting for retrieved literals. 21 | _TOTAL_PROMPT_TOKEN_LIMIT = 32300 22 | _BASE_INSTRUCTION_TOKEN_LENGTH = 1220 23 | # Estimated 10 tokens per literals since each literal is presented as a filter expression 24 | # (i.e. table.column = 'literal'). 25 | # Currently 10 literals are retrieved per search. 26 | _TOKENS_PER_LITERAL = 10 27 | _NUM_LITERAL_RETRIEVALS = 10 28 | 29 | # As per https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them 30 | _CHARS_PER_TOKEN = 4 31 | 32 | 33 | def _get_field(msg: ProtoMsg, field_name: str) -> Any: 34 | fields = [value for fd, value in msg.ListFields() if fd.name == field_name] 35 | if not fields: 36 | return None 37 | return fields[0] 38 | 39 | 40 | def _count_search_services(model: ProtoMsg) -> int: 41 | cnt = 0 42 | tables = _get_field(model, "tables") 43 | if not tables: 44 | return 0 45 | 46 | for table in tables: 47 | dimensions = _get_field(table, "dimensions") 48 | if not dimensions: 49 | continue 50 | for dimension in dimensions: 51 | if _get_field(dimension, "cortex_search_service_name"): 52 | cnt += 1 53 | return cnt 54 | 55 | 56 | def validate_context_length( 57 | model_orig: semantic_model_pb2.SemanticModel, throw_error: bool = False 58 | ) -> None: 59 | """ 60 | Validate the token limit for the model with space for the prompt. 61 | 62 | yaml_model: The yaml semantic model 63 | throw_error: Should this function throw an error or just a warning. 64 | """ 65 | # When counting tokens, we need to remove the verified_queries field and additional sample values. 
Make a copy for counting. 66 | model = copy.deepcopy(model_orig) 67 | model.ClearField("verified_queries") 68 | # Also clear all the dimensional sample values, as we'll retrieve those into filters by default. 69 | for t in model.tables: 70 | for dim in t.dimensions: 71 | del dim.sample_values[_MAX_SAMPLE_VALUES:] 72 | 73 | num_search_services = _count_search_services(model) 74 | 75 | yaml_str = proto_to_yaml(model) 76 | # Pass in the str version of the semantic context yaml. 77 | # This isn't exactly how many tokens the model will be, but should roughly be correct. 78 | literals_buffer = ( 79 | _TOKENS_PER_LITERAL * _NUM_LITERAL_RETRIEVALS * (1 + num_search_services) 80 | ) 81 | approx_instruction_length = _BASE_INSTRUCTION_TOKEN_LENGTH + literals_buffer 82 | model_tokens_limit = _TOTAL_PROMPT_TOKEN_LIMIT - approx_instruction_length 83 | model_tokens = len(yaml_str) // _CHARS_PER_TOKEN 84 | if model_tokens > model_tokens_limit: 85 | tokens_to_remove = model_tokens - model_tokens_limit 86 | chars_to_remove = tokens_to_remove * _CHARS_PER_TOKEN 87 | if throw_error: 88 | raise ValueError( 89 | f"Your semantic model is too large. " 90 | f"Passed size is {len(yaml_str)} characters. " 91 | f"We need you to remove {chars_to_remove} characters in your semantic model. " 92 | f"Please check: \n" 93 | f" (1) If you have long descriptions that can be truncated. \n" 94 | f" (2) If you can remove some columns that are not used within your tables. \n" 95 | f" (3) If you have extra tables you do not need." 96 | ) 97 | else: 98 | logger.warning( 99 | f"WARNING 🚨: The Semantic model is too large. \n" 100 | f"Passed size is {len(yaml_str)} characters. " 101 | f"We need you to remove {chars_to_remove} characters in your semantic model. " 102 | f"Please check: \n" 103 | f" (1) If you have long descriptions that can be truncated. \n" 104 | f" (2) If you can remove some columns that are not used within your tables. \n" 105 | f" (3) If you have extra tables you do not need. \n" 106 | f" Once you've finished updating, please validate your semantic model." 
107 | ) 108 | -------------------------------------------------------------------------------- /semantic_model_generator/validate/keywords.py: -------------------------------------------------------------------------------- 1 | # https://docs.snowflake.com/en/sql-reference/reserved-keywords 2 | SF_RESERVED_WORDS = { 3 | "ACCOUNT", 4 | "ALL", 5 | "ALTER", 6 | "AND", 7 | "ANY", 8 | "AS", 9 | "BETWEEN", 10 | "BY", 11 | "CASE", 12 | "CAST", 13 | "CHECK", 14 | "COLUMN", 15 | "CONNECT", 16 | "CONNECTION", 17 | "CONSTRAINT", 18 | "CREATE", 19 | "CROSS", 20 | "CURRENT", 21 | "CURRENT_DATE", 22 | "CURRENT_TIME", 23 | "CURRENT_TIMESTAMP", 24 | "CURRENT_USER", 25 | "DATABASE", 26 | "DELETE", 27 | "DISTINCT", 28 | "DROP", 29 | "ELSE", 30 | "EXISTS", 31 | "FALSE", 32 | "FOLLOWING", 33 | "FOR", 34 | "FROM", 35 | "FULL", 36 | "GRANT", 37 | "GROUP", 38 | "GSCLUSTER", 39 | "HAVING", 40 | "ILIKE", 41 | "IN", 42 | "INCREMENT", 43 | "INNER", 44 | "INSERT", 45 | "INTERSECT", 46 | "INTO", 47 | "IS", 48 | "ISSUE", 49 | "JOIN", 50 | "LATERAL", 51 | "LEFT", 52 | "LIKE", 53 | "LOCALTIME", 54 | "LOCALTIMESTAMP", 55 | "MINUS", 56 | "NATURAL", 57 | "NOT", 58 | "NULL", 59 | "OF", 60 | "ON", 61 | "OR", 62 | "ORDER", 63 | "ORGANIZATION", 64 | "QUALIFY", 65 | "REGEXP", 66 | "REVOKE", 67 | "RIGHT", 68 | "RLIKE", 69 | "ROW", 70 | "ROWS", 71 | "SAMPLE", 72 | "SCHEMA", 73 | "SELECT", 74 | "SET", 75 | "SOME", 76 | "START", 77 | "TABLE", 78 | "TABLESAMPLE", 79 | "THEN", 80 | "TO", 81 | "TRIGGER", 82 | "TRUE", 83 | "TRY_CAST", 84 | "UNION", 85 | "UNIQUE", 86 | "UPDATE", 87 | "USING", 88 | "VALUES", 89 | "VIEW", 90 | "WHEN", 91 | "WHENEVER", 92 | "WHERE", 93 | "WITH", 94 | } 95 | -------------------------------------------------------------------------------- /semantic_model_generator/validate/schema.py: -------------------------------------------------------------------------------- 1 | # This file is essentially doing DFS in the protobuf Descriptors and storing in the SCHEMA. We start with as the root 2 | # SemanticModel at the bottom of this file. This will automatically pickup any changes to the protobuf (given you run 3 | # the protoc command before to regenerate the python files. Different proto messages can have the same message type as a 4 | # child, so we keep a dict of precomputed types to avoid double computing. This currently does not support cycles in the 5 | # proto definition, but we can add a visited set to this if we ever need to. 
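# A minimal worked example of the mapping, using the RelationKey message from semantic_model.proto: its two
# plain string fields carry no field options, so create_schema_for_message() produces
#   Map({"left_column": Str(), "right_column": Str()})
# while fields marked `optional` become Optional(...) keys, `sql_expression` strings become SqlExpression(),
# `id_field` strings become IdField(), and repeated fields are wrapped in Seq(...) (or VerifiedQueries for
# the verified_queries field).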
6 | 7 | 8 | from typing import Dict 9 | 10 | import sqlglot 11 | from google.protobuf.descriptor import Descriptor, EnumDescriptor, FieldDescriptor 12 | from strictyaml import ( 13 | Bool, 14 | Decimal, 15 | Enum, 16 | Int, 17 | Map, 18 | Optional, 19 | Seq, 20 | Str, 21 | Validator, 22 | YAMLValidationError, 23 | ) 24 | 25 | from semantic_model_generator.protos import semantic_model_pb2 26 | from semantic_model_generator.validate.keywords import SF_RESERVED_WORDS 27 | 28 | scalar_type_map = { 29 | FieldDescriptor.TYPE_BOOL: Bool, 30 | FieldDescriptor.TYPE_STRING: Str, 31 | FieldDescriptor.TYPE_DOUBLE: Decimal, 32 | FieldDescriptor.TYPE_FLOAT: Decimal, 33 | FieldDescriptor.TYPE_INT32: Int, 34 | FieldDescriptor.TYPE_INT64: Int, 35 | } 36 | 37 | 38 | class SqlExpression(Str): # type: ignore 39 | def validate_scalar(self, chunk): # type: ignore 40 | try: 41 | sqlglot.parse_one(chunk.contents, dialect=sqlglot.dialects.Snowflake) # type: ignore 42 | except Exception: 43 | chunk.expecting_but_found("", "invalid SQL expression") 44 | return chunk.contents 45 | 46 | 47 | class IdField(Str): # type: ignore 48 | def validate_scalar(self, chunk): # type: ignore 49 | if not chunk.contents.replace("_", "").replace("$", "").isalnum(): 50 | chunk.expecting_but_found( 51 | "", 52 | "name can only contain letters, underscores, decimal digits (0-9), and dollar signs ($).", 53 | ) 54 | if chunk.contents.upper() in SF_RESERVED_WORDS: 55 | chunk.expecting_but_found("", "name cannot be a Snowflake reserved keyword") 56 | return chunk.contents 57 | 58 | 59 | class VerifiedQueries(Seq): # type: ignore 60 | """ 61 | Validator for the verified_queries field. 62 | We ensure that there are no duplicate verified queries, by checking for duplicate (question, sql) pairs. 63 | """ 64 | 65 | def validate(self, chunk): # type: ignore 66 | super().validate(chunk) 67 | seen_queries = set() 68 | for query in chunk.contents: 69 | qa_pair = (query["question"], query["sql"]) 70 | if qa_pair in seen_queries: 71 | raise YAMLValidationError( 72 | context="Duplicate verified query found.", 73 | problem=query["name"], 74 | chunk=chunk, 75 | ) 76 | seen_queries.add(qa_pair) 77 | 78 | 79 | def create_schema_for_message( 80 | message: Descriptor, precomputed_types: Dict[str, Validator] 81 | ) -> Validator: 82 | if message.name in precomputed_types: 83 | return precomputed_types[message.name] 84 | message_schema = {} 85 | for k, v in message.fields_by_name.items(): 86 | if _is_optional_field(v): 87 | message_schema[Optional(k)] = create_schema_for_field(v, precomputed_types) 88 | else: 89 | message_schema[k] = create_schema_for_field(v, precomputed_types) 90 | schema = Map(message_schema) 91 | precomputed_types[message.name] = schema 92 | return schema 93 | 94 | 95 | def create_schema_for_field( 96 | field_descriptor: FieldDescriptor, precomputed_types: Dict[str, Validator] 97 | ) -> Validator: 98 | if field_descriptor.type == FieldDescriptor.TYPE_MESSAGE: 99 | field_type = create_schema_for_message( 100 | field_descriptor.message_type, precomputed_types 101 | ) 102 | elif field_descriptor.type == FieldDescriptor.TYPE_ENUM: 103 | field_type = create_schema_for_enum( 104 | field_descriptor.enum_type, precomputed_types 105 | ) 106 | elif field_descriptor.type == FieldDescriptor.TYPE_STRING and _is_sql_expression( 107 | field_descriptor 108 | ): 109 | field_type = SqlExpression() 110 | elif field_descriptor.type == FieldDescriptor.TYPE_STRING and _is_id_field( 111 | field_descriptor 112 | ): 113 | field_type = IdField() 114 | elif 
field_descriptor.type in scalar_type_map: 115 | field_type = scalar_type_map[field_descriptor.type]() 116 | else: 117 | raise Exception(f"unsupported type: {field_descriptor.type}") 118 | 119 | if field_descriptor.label == FieldDescriptor.LABEL_REPEATED: 120 | if field_descriptor.name == "verified_queries": 121 | field_type = VerifiedQueries(field_type) 122 | else: 123 | field_type = Seq(field_type) 124 | 125 | return field_type 126 | 127 | 128 | def _is_optional_field(field_descriptor: FieldDescriptor) -> bool: 129 | return _has_field_option(field_descriptor, "optional") 130 | 131 | 132 | def _is_sql_expression(field_descriptor: FieldDescriptor) -> bool: 133 | return _has_field_option(field_descriptor, "sql_expression") 134 | 135 | 136 | def _is_id_field(field_descriptor: FieldDescriptor) -> bool: 137 | return _has_field_option(field_descriptor, "id_field") 138 | 139 | 140 | def _has_field_option(field_descriptor: FieldDescriptor, option_name: str) -> bool: 141 | option = list( 142 | filter( 143 | lambda o: o[0].name == option_name, 144 | field_descriptor.GetOptions().ListFields(), 145 | ) 146 | ) 147 | # ListFields returns a list of (FieldDescriptor, value) tuples. This checks that the given option is present 148 | # and that its value is True 149 | return len(option) > 0 and option[0][1] 150 | 151 | 152 | def create_schema_for_enum( 153 | enum: EnumDescriptor, precomputed_types: Dict[str, Validator] 154 | ) -> Validator: 155 | if enum.name in precomputed_types: 156 | return precomputed_types[enum.name] 157 | schema = Enum([v.name for v in enum.values]) 158 | precomputed_types[enum.name] = schema 159 | return schema 160 | 161 | 162 | SCHEMA = create_schema_for_message(semantic_model_pb2.SemanticModel.DESCRIPTOR, {}) 163 | -------------------------------------------------------------------------------- /semantic_model_generator/validate_model.py: -------------------------------------------------------------------------------- 1 | from snowflake.connector import SnowflakeConnection 2 | 3 | from app_utils.chat import send_message 4 | 5 | 6 | def load_yaml(yaml_path: str) -> str: 7 | """ 8 | Load local yaml file into str. 9 | 10 | yaml_path: str The absolute path to the location of your yaml file. Something like path/to/your/file.yaml. 11 | """ 12 | with open(yaml_path) as f: 13 | yaml_str = f.read() 14 | return yaml_str 15 | 16 | 17 | def validate(yaml_str: str, conn: SnowflakeConnection) -> None: 18 | """ 19 | We perform pseudo-validation by issuing a request to Cortex Analyst with the YAML string as-is, and determining 20 | whether the request is successful. We don't currently have an explicit validation endpoint available, but validation 21 | is run at inference time, so this is a reasonable proxy. 22 | 23 | This is done in order to remove the need to sync validation logic locally between these codepaths and Analyst. 24 | 25 | yaml_str: yaml content in string format. 
26 | conn: SnowflakeConnection Snowflake connection to pass in 27 | """ 28 | 29 | dummy_request = [ 30 | {"role": "user", "content": [{"type": "text", "text": "SMG app validation"}]} 31 | ] 32 | send_message(conn, yaml_str, dummy_request) 33 | 34 | 35 | def validate_from_local_path(yaml_path: str, conn: SnowflakeConnection) -> None: 36 | yaml_str = load_yaml(yaml_path) 37 | validate(yaml_str, conn) 38 | -------------------------------------------------------------------------------- /sis_setup/app_setup.sql: -------------------------------------------------------------------------------- 1 | SET (streamlit_warehouse)=(SELECT CURRENT_WAREHOUSE()); 2 | 3 | CREATE DATABASE IF NOT EXISTS CORTEX_ANALYST_SEMANTICS 4 | COMMENT = '{"origin": "sf_sit", 5 | "name": "skimantics", 6 | "version": {"major": 2, "minor": 0}, 7 | "attributes": {"deployment": "sis"}}'; 8 | 9 | CREATE SCHEMA IF NOT EXISTS CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR 10 | COMMENT = '{"origin": "sf_sit", 11 | "name": "skimantics", 12 | "version": {"major": 2, "minor": 0}, 13 | "attributes": {"deployment": "sis"}}'; 14 | 15 | -- Create stage for App logic and 3rd party packages 16 | CREATE OR REPLACE STAGE CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE 17 | DIRECTORY = (ENABLE = true) 18 | COMMENT = '{"origin": "sf_sit", 19 | "name": "skimantics", 20 | "version": {"major": 2, "minor": 0}, 21 | "attributes": {"deployment": "sis"}}'; 22 | 23 | -- Upload 3rd party packages 24 | -- Run from sis_setup/ as paths are relative to this directory 25 | PUT file://app_utils/*.zip @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE OVERWRITE = TRUE AUTO_COMPRESS = FALSE; 26 | 27 | -- Upload App logic 28 | PUT file://app.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE; 29 | PUT file://environment.yml @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE; 30 | PUT file://semantic_model_generator/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE; 31 | PUT file://semantic_model_generator/data_processing/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/data_processing/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE; 32 | PUT file://semantic_model_generator/protos/*.p* @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/protos/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE; 33 | PUT file://semantic_model_generator/snowflake_utils/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/snowflake_utils/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE; 34 | PUT file://semantic_model_generator/validate/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/validate/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE; 35 | PUT file://images/*.png @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/images/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE; 36 | PUT file://journeys/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/journeys/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE; 37 | PUT file://partner/*.py 
@CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/partner/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE; 38 | PUT file://app_utils/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/app_utils/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE; 39 | 40 | -- Create Streamlit 41 | CREATE OR REPLACE STREAMLIT CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.SEMANTIC_MODEL_GENERATOR 42 | ROOT_LOCATION = '@CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator' 43 | MAIN_FILE = 'app.py' 44 | TITLE = "Semantic Model Generator" 45 | IMPORTS = ('@cortex_analyst_semantics.semantic_model_generator.streamlit_stage/looker_sdk.zip', 46 | '@cortex_analyst_semantics.semantic_model_generator.streamlit_stage/strictyaml.zip') 47 | QUERY_WAREHOUSE = $streamlit_warehouse 48 | COMMENT = '{"origin": "sf_sit", 49 | "name": "skimantics", 50 | "version": {"major": 2, "minor": 0}, 51 | "attributes": {"deployment": "sis"}}'; 52 | 53 | 54 | -- Create Semantic Model Generation Callable 55 | -- Zip src files for callable SPROC for generation 56 | CREATE OR REPLACE PROCEDURE CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.ZIP_SRC_FILES( 57 | database STRING, 58 | schema STRING, 59 | stage STRING, 60 | source_path STRING, 61 | target_parent STRING, 62 | zip_filename STRING 63 | ) 64 | RETURNS VARCHAR 65 | LANGUAGE PYTHON 66 | RUNTIME_VERSION = 3.10 67 | PACKAGES = ( 68 | 'snowflake-snowpark-python==1.18.0' 69 | ) 70 | HANDLER='zip_staged_files' 71 | EXECUTE AS CALLER 72 | AS $$ 73 | from snowflake.snowpark import Session 74 | from typing import Optional 75 | 76 | def get_staged_files(session: Session, 77 | database: str, 78 | schema: str, 79 | stage: str, 80 | target_parent: Optional[str] = None, 81 | source_path: Optional[str] = None, 82 | ) -> dict[str, str]: 83 | 84 | query = f"ls @{database}.{schema}.{stage}/{source_path}" 85 | file_result = session.sql(query).collect() 86 | 87 | file_data = {} 88 | for row in file_result: 89 | filename = row['name'].split('/',1)[1] # Remove the stage name from the filename 90 | 91 | # If target_parent is provided, replace the original file pathing with it 92 | if target_parent: 93 | filename = filename.replace(source_path, f"{target_parent}") 94 | 95 | full_file_path = f"@{database}.{schema}.{row['name']}" 96 | file_data[filename] = session.file.get_stream(f"{full_file_path}").read().decode('utf-8') 97 | 98 | return file_data 99 | 100 | def create_zip(file_data: dict[str, str]) -> bytes: 101 | import io 102 | import zipfile 103 | 104 | zip_buffer = io.BytesIO() 105 | 106 | with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED 107 | ) as zipf: 108 | for filename, content in file_data.items(): 109 | zipf.writestr(filename, content) 110 | 111 | zip_bytes = zip_buffer.getvalue() 112 | 113 | return zip_bytes 114 | 115 | def upload_zip(session: Session, 116 | database: str, 117 | schema: str, 118 | stage: str, 119 | zip_file: bytes, 120 | zip_filename: str, 121 | ) -> None: 122 | import io 123 | 124 | session.file.put_stream( 125 | io.BytesIO(zip_file), 126 | f"@{database}.{schema}.{stage}/{zip_filename.replace('zip','')}.zip", 127 | auto_compress=False, 128 | overwrite=True, 129 | ) 130 | 131 | def zip_staged_files(session: Session, 132 | database: str, 133 | schema: str, 134 | stage: str, 135 | source_path: Optional[str] = None, 136 | target_parent: Optional[str] = None, 137 | zip_filename: Optional[str] = None, 138 | ) -> str: 139 | 140 | file_data = 
get_staged_files(session, database, schema, stage, target_parent, source_path) 141 | zip_file = create_zip(file_data) 142 | 143 | if zip_filename: 144 | zip_filename = zip_filename 145 | elif target_parent is not None: 146 | zip_filename = target_parent 147 | elif source_path is not None: 148 | zip_filename = source_path 149 | else: 150 | zip_filename = "zipped_files" 151 | 152 | upload_zip(session, database, schema, stage, zip_file, zip_filename) 153 | 154 | return f"Files zipped and uploaded to {database}.{schema}.{stage}/{zip_filename}.zip." 155 | 156 | $$; 157 | 158 | CALL CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.ZIP_SRC_FILES( 159 | 'CORTEX_ANALYST_SEMANTICS', 160 | 'SEMANTIC_MODEL_GENERATOR', 161 | 'streamlit_stage', 162 | 'semantic_model_generator/semantic_model_generator', 163 | 'semantic_model_generator', 164 | 'semantic_model_generator' 165 | ); 166 | 167 | -- Create generation callable 168 | CREATE OR REPLACE PROCEDURE CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.GENERATE_SEMANTIC_FILE( 169 | STAGE_NAME STRING, 170 | MODEL_NAME STRING, 171 | SAMPLE_VALUE INT, 172 | ALLOW_JOINS BOOLEAN, 173 | TABLE_LIST ARRAY 174 | ) 175 | RETURNS VARCHAR 176 | LANGUAGE PYTHON 177 | RUNTIME_VERSION = 3.10 178 | PACKAGES = ( 179 | 'pandas==2.2.2', 180 | 'tqdm==4.66.5', 181 | 'loguru==0.5.3', 182 | 'protobuf==3.20.3', 183 | 'pydantic==2.8.2', 184 | 'pyyaml==6.0.1', 185 | 'ruamel.yaml==0.17.21', 186 | 'pyarrow==14.0.2', 187 | 'sqlglot==25.10.0', 188 | 'numpy==1.26.4', 189 | 'python-dotenv==0.21.0', 190 | 'urllib3==2.2.2', 191 | 'types-pyyaml==6.0.12.12', 192 | 'types-protobuf==4.25.0.20240417', 193 | 'snowflake-snowpark-python==1.18.0', 194 | 'cattrs==23.1.2', 195 | 'filelock' 196 | ) 197 | IMPORTS = ('@CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator.zip', 198 | '@cortex_analyst_semantics.semantic_model_generator.streamlit_stage/strictyaml.zip' 199 | ) 200 | HANDLER='run_generation' 201 | EXECUTE AS CALLER 202 | AS $$ 203 | from snowflake.snowpark import Session 204 | 205 | def import_src_zip(zip_name = 'semantic_model_generator.zip'): 206 | """Unpacks source zip file in stage to enable importing it to mirror source code structure.""" 207 | 208 | import os 209 | import sys 210 | import zipfile 211 | from filelock import FileLock 212 | 213 | # Get the location of the import directory. Snowflake sets the import 214 | # directory location so code can retrieve the location via sys._xoptions. 215 | IMPORT_DIRECTORY_NAME = "snowflake_import_directory" 216 | import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME] 217 | 218 | # Get the path to the ZIP file and set the location to extract to. 219 | zip_file_path = import_dir + zip_name 220 | extracted = f'/tmp/{zip_name.replace(".zip", "")}' 221 | 222 | # Extract the contents of the ZIP. This is done under the file lock 223 | # to ensure that only one worker process unzips the contents. 
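    # The isdir() check below also makes extraction idempotent: once one process has unpacked the
    # archive on a node, later calls on that node skip the extraction entirely.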
224 | with FileLock('/tmp/extract.lock'): 225 | if not os.path.isdir(extracted): 226 | with zipfile.ZipFile(zip_file_path, 'r') as myzip: 227 | myzip.extractall(extracted) 228 | 229 | # Add in front in case there are conflicting module names including original zipped file 230 | sys.path.insert(0,extracted) 231 | 232 | def run_generation(session: Session, 233 | STAGE_NAME: str, 234 | MODEL_NAME: str, 235 | SAMPLE_VALUE: int, 236 | ALLOW_JOINS: bool, 237 | TABLE_LIST: list[str]) -> str: 238 | 239 | import io 240 | 241 | import_src_zip() 242 | from semantic_model_generator.generate_model import generate_model_str_from_snowflake 243 | 244 | if not MODEL_NAME: 245 | raise ValueError("Please provide a name for your semantic model.") 246 | elif not TABLE_LIST: 247 | raise ValueError("Please select at least one table to proceed.") 248 | else: 249 | yaml_str = generate_model_str_from_snowflake( 250 | base_tables=TABLE_LIST, 251 | semantic_model_name=MODEL_NAME, 252 | n_sample_values=SAMPLE_VALUE, # type: ignore 253 | conn=session.connection, 254 | allow_joins=ALLOW_JOINS, 255 | ) 256 | 257 | session.file.put_stream( 258 | io.BytesIO(yaml_str.encode('utf-8')), 259 | f"@{STAGE_NAME}/{MODEL_NAME}.yaml", 260 | auto_compress=False, 261 | overwrite=True, 262 | ) 263 | return f"Semantic model file {MODEL_NAME}.yaml has been generated and saved to {STAGE_NAME}." 264 | $$; -------------------------------------------------------------------------------- /sis_setup/looker_integration.sql: -------------------------------------------------------------------------------- 1 | USE DATABASE CORTEX_ANALYST_SEMANTICS; 2 | USE SCHEMA SEMANTIC_MODEL_GENERATOR; 3 | 4 | CREATE OR REPLACE NETWORK RULE looker_rule 5 | MODE = EGRESS 6 | TYPE = HOST_PORT 7 | VALUE_LIST = ('<% looker_url %>'); 8 | 9 | CREATE OR REPLACE SECRET looker_client_secret 10 | TYPE = GENERIC_STRING 11 | SECRET_STRING = '<% client_secret %>'; 12 | 13 | CREATE OR REPLACE EXTERNAL ACCESS INTEGRATION looker_access_int 14 | ALLOWED_NETWORK_RULES = (looker_rule) 15 | ALLOWED_AUTHENTICATION_SECRETS = (looker_client_secret) 16 | ENABLED = TRUE; 17 | 18 | GRANT READ ON SECRET looker_client_secret TO ROLE <% streamlit_role %>; 19 | GRANT USAGE ON INTEGRATION looker_access_int TO ROLE <% streamlit_role %>; 20 | 21 | USE ROLE <% streamlit_role %>; 22 | 23 | ALTER STREAMLIT CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.SEMANTIC_MODEL_GENERATOR 24 | SET EXTERNAL_ACCESS_INTEGRATIONS = (looker_access_int) 25 | SECRETS = ('looker_client_secret' = CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.looker_client_secret); -------------------------------------------------------------------------------- /sis_setup/sissetup_snowsightgit.sql: -------------------------------------------------------------------------------- 1 | SET (streamlit_warehouse)=(SELECT CURRENT_WAREHOUSE()); 2 | 3 | CREATE DATABASE IF NOT EXISTS CORTEX_ANALYST_SEMANTICS; 4 | USE DATABASE CORTEX_ANALYST_SEMANTICS; 5 | 6 | -- Create API Integration for Git 7 | CREATE OR REPLACE API INTEGRATION git_api_integration_snowflake_labs 8 | API_PROVIDER = git_https_api 9 | API_ALLOWED_PREFIXES = ('https://github.com/Snowflake-Labs') 10 | ENABLED = TRUE; 11 | 12 | -- Create Git Repository 13 | CREATE OR REPLACE GIT REPOSITORY git_snowflake_semantic_model_generator 14 | API_INTEGRATION = git_api_integration_snowflake_labs 15 | ORIGIN = 'https://github.com/Snowflake-Labs/semantic-model-generator.git'; 16 | 17 | ALTER GIT REPOSITORY git_snowflake_semantic_model_generator FETCH; 18 | 19 | -- Create Schema to host 
streamlit app 20 | CREATE SCHEMA IF NOT EXISTS CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR 21 | COMMENT = '{"origin": "sf_sit", 22 | "name": "skimantics", 23 | "version": {"major": 2, "minor": 0}, 24 | "attributes": {"deployment": "sis"}}'; 25 | 26 | -- Create stage for App logic and 3rd party packages 27 | CREATE OR REPLACE STAGE CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE 28 | DIRECTORY = (ENABLE = true) 29 | COMMENT = '{"origin": "sf_sit", 30 | "name": "skimantics", 31 | "version": {"major": 2, "minor": 0}, 32 | "attributes": {"deployment": "sis"}}'; 33 | 34 | -- Copy Files from Git Repository into App Stage 35 | COPY FILES 36 | INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE 37 | FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/app_utils/ 38 | PATTERN='.*[.]zip'; 39 | 40 | COPY FILES 41 | INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/ 42 | FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/ 43 | FILES = ('environment.yml', 'app.py'); 44 | 45 | COPY FILES 46 | INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/ 47 | FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/semantic_model_generator/ 48 | PATTERN='.*[.]py'; 49 | 50 | RM @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/tests; 51 | RM @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/output_models; 52 | 53 | COPY FILES 54 | INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/images/ 55 | FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/images/ 56 | PATTERN='.*[.]png'; 57 | 58 | COPY FILES 59 | INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/journeys/ 60 | FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/journeys/ 61 | PATTERN='.*[.]py'; 62 | 63 | COPY FILES 64 | INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/partner/ 65 | FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/partner/ 66 | PATTERN='.*[.]py'; 67 | 68 | COPY FILES 69 | INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/app_utils/ 70 | FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/app_utils/ 71 | PATTERN='.*[.]py'; 72 | 73 | -- Create Streamlit App 74 | CREATE OR REPLACE STREAMLIT CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.SEMANTIC_MODEL_GENERATOR 75 | ROOT_LOCATION = '@CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator' 76 | MAIN_FILE = 'app.py' 77 | TITLE = "Semantic Model Generator" 78 | IMPORTS = ('@cortex_analyst_semantics.semantic_model_generator.streamlit_stage/looker_sdk.zip', 79 | '@cortex_analyst_semantics.semantic_model_generator.streamlit_stage/strictyaml.zip') 80 | QUERY_WAREHOUSE = $streamlit_warehouse 81 | COMMENT = '{"origin": "sf_sit", 82 | "name": "skimantics", 83 | "version": {"major": 2, "minor": 0}, 84 | "attributes": {"deployment": "sis"}}'; 85 | 86 | 87 | -- Create Semantic Model Generation Callable 88 | -- Zip src files for 
callable SPROC for generation 89 | CREATE OR REPLACE PROCEDURE CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.ZIP_SRC_FILES( 90 | database STRING, 91 | schema STRING, 92 | stage STRING, 93 | source_path STRING, 94 | target_parent STRING, 95 | zip_filename STRING 96 | ) 97 | RETURNS VARCHAR 98 | LANGUAGE PYTHON 99 | RUNTIME_VERSION = 3.10 100 | PACKAGES = ( 101 | 'snowflake-snowpark-python==1.18.0' 102 | ) 103 | HANDLER='zip_staged_files' 104 | EXECUTE AS CALLER 105 | AS $$ 106 | from snowflake.snowpark import Session 107 | from typing import Optional 108 | 109 | def get_staged_files(session: Session, 110 | database: str, 111 | schema: str, 112 | stage: str, 113 | target_parent: Optional[str] = None, 114 | source_path: Optional[str] = None, 115 | ) -> dict[str, str]: 116 | 117 | query = f"ls @{database}.{schema}.{stage}/{source_path}" 118 | file_result = session.sql(query).collect() 119 | 120 | file_data = {} 121 | for row in file_result: 122 | filename = row['name'].split('/',1)[1] # Remove the stage name from the filename 123 | 124 | # If target_parent is provided, replace the original file pathing with it 125 | if target_parent: 126 | filename = filename.replace(source_path, f"{target_parent}") 127 | 128 | full_file_path = f"@{database}.{schema}.{row['name']}" 129 | file_data[filename] = session.file.get_stream(f"{full_file_path}").read().decode('utf-8') 130 | 131 | return file_data 132 | 133 | def create_zip(file_data: dict[str, str]) -> bytes: 134 | import io 135 | import zipfile 136 | 137 | zip_buffer = io.BytesIO() 138 | 139 | with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED 140 | ) as zipf: 141 | for filename, content in file_data.items(): 142 | zipf.writestr(filename, content) 143 | 144 | zip_bytes = zip_buffer.getvalue() 145 | 146 | return zip_bytes 147 | 148 | def upload_zip(session: Session, 149 | database: str, 150 | schema: str, 151 | stage: str, 152 | zip_file: bytes, 153 | zip_filename: str, 154 | ) -> None: 155 | import io 156 | 157 | session.file.put_stream( 158 | io.BytesIO(zip_file), 159 | f"@{database}.{schema}.{stage}/{zip_filename.replace('zip','')}.zip", 160 | auto_compress=False, 161 | overwrite=True, 162 | ) 163 | 164 | def zip_staged_files(session: Session, 165 | database: str, 166 | schema: str, 167 | stage: str, 168 | source_path: Optional[str] = None, 169 | target_parent: Optional[str] = None, 170 | zip_filename: Optional[str] = None, 171 | ) -> str: 172 | 173 | file_data = get_staged_files(session, database, schema, stage, target_parent, source_path) 174 | zip_file = create_zip(file_data) 175 | 176 | if zip_filename: 177 | zip_filename = zip_filename 178 | elif target_parent is not None: 179 | zip_filename = target_parent 180 | elif source_path is not None: 181 | zip_filename = source_path 182 | else: 183 | zip_filename = "zipped_files" 184 | 185 | upload_zip(session, database, schema, stage, zip_file, zip_filename) 186 | 187 | return f"Files zipped and uploaded to {database}.{schema}.{stage}/{zip_filename}.zip." 
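    # Naming precedence for the uploaded archive: an explicit zip_filename wins, then target_parent,
    # then source_path, with "zipped_files" as the final fallback.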
188 | 189 | $$; 190 | 191 | CALL CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.ZIP_SRC_FILES( 192 | 'CORTEX_ANALYST_SEMANTICS', 193 | 'SEMANTIC_MODEL_GENERATOR', 194 | 'streamlit_stage', 195 | 'semantic_model_generator/semantic_model_generator', 196 | 'semantic_model_generator', 197 | 'semantic_model_generator' 198 | ); 199 | 200 | -- Create generation callable 201 | CREATE OR REPLACE PROCEDURE CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.GENERATE_SEMANTIC_FILE( 202 | STAGE_NAME STRING, 203 | MODEL_NAME STRING, 204 | SAMPLE_VALUE INT, 205 | ALLOW_JOINS BOOLEAN, 206 | TABLE_LIST ARRAY 207 | ) 208 | RETURNS VARCHAR 209 | LANGUAGE PYTHON 210 | RUNTIME_VERSION = 3.10 211 | PACKAGES = ( 212 | 'pandas==2.2.2', 213 | 'tqdm==4.66.5', 214 | 'loguru==0.5.3', 215 | 'protobuf==3.20.3', 216 | 'pydantic==2.8.2', 217 | 'pyyaml==6.0.1', 218 | 'ruamel.yaml==0.17.21', 219 | 'pyarrow==14.0.2', 220 | 'sqlglot==25.10.0', 221 | 'numpy==1.26.4', 222 | 'python-dotenv==0.21.0', 223 | 'urllib3==2.2.2', 224 | 'types-pyyaml==6.0.12.12', 225 | 'types-protobuf==4.25.0.20240417', 226 | 'snowflake-snowpark-python==1.18.0', 227 | 'cattrs==23.1.2', 228 | 'filelock' 229 | ) 230 | IMPORTS = ('@CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator.zip', 231 | '@cortex_analyst_semantics.semantic_model_generator.streamlit_stage/strictyaml.zip' 232 | ) 233 | HANDLER='run_generation' 234 | EXECUTE AS CALLER 235 | AS $$ 236 | from snowflake.snowpark import Session 237 | 238 | def import_src_zip(zip_name = 'semantic_model_generator.zip'): 239 | """Unpacks source zip file in stage to enable importing it to mirror source code structure.""" 240 | 241 | import os 242 | import sys 243 | import zipfile 244 | from filelock import FileLock 245 | 246 | # Get the location of the import directory. Snowflake sets the import 247 | # directory location so code can retrieve the location via sys._xoptions. 248 | IMPORT_DIRECTORY_NAME = "snowflake_import_directory" 249 | import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME] 250 | 251 | # Get the path to the ZIP file and set the location to extract to. 252 | zip_file_path = import_dir + zip_name 253 | extracted = f'/tmp/{zip_name.replace(".zip", "")}' 254 | 255 | # Extract the contents of the ZIP. This is done under the file lock 256 | # to ensure that only one worker process unzips the contents. 
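    # Both the lock file and the extraction target live in node-local /tmp, so the lock only needs to
    # serialize worker processes on the same node; each node extracts its own copy of the archive.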
257 | with FileLock('/tmp/extract.lock'): 258 | if not os.path.isdir(extracted): 259 | with zipfile.ZipFile(zip_file_path, 'r') as myzip: 260 | myzip.extractall(extracted) 261 | 262 | # Add in front in case there are conflicting module names including original zipped file 263 | sys.path.insert(0,extracted) 264 | 265 | def run_generation(session: Session, 266 | STAGE_NAME: str, 267 | MODEL_NAME: str, 268 | SAMPLE_VALUE: int, 269 | ALLOW_JOINS: bool, 270 | TABLE_LIST: list[str]) -> str: 271 | 272 | import io 273 | 274 | import_src_zip() 275 | from semantic_model_generator.generate_model import generate_model_str_from_snowflake 276 | 277 | if not MODEL_NAME: 278 | raise ValueError("Please provide a name for your semantic model.") 279 | elif not TABLE_LIST: 280 | raise ValueError("Please select at least one table to proceed.") 281 | else: 282 | yaml_str = generate_model_str_from_snowflake( 283 | base_tables=TABLE_LIST, 284 | semantic_model_name=MODEL_NAME, 285 | n_sample_values=SAMPLE_VALUE, # type: ignore 286 | conn=session.connection, 287 | allow_joins=ALLOW_JOINS, 288 | ) 289 | 290 | session.file.put_stream( 291 | io.BytesIO(yaml_str.encode('utf-8')), 292 | f"@{STAGE_NAME}/{MODEL_NAME}.yaml", 293 | auto_compress=False, 294 | overwrite=True, 295 | ) 296 | return f"Semantic model file {MODEL_NAME}.yaml has been generated and saved to {STAGE_NAME}." 297 | $$; --------------------------------------------------------------------------------
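For reference, a minimal sketch of invoking the generation procedure created by the setup scripts above from a Snowpark Python session. The connection parameters, model name, and table below are placeholder assumptions; per the procedure body, STAGE_NAME is passed without a leading @ and the generated YAML lands at @<STAGE_NAME>/<MODEL_NAME>.yaml.

# Minimal usage sketch (placeholder credentials, a placeholder table, and the stage created by the setup scripts).
from snowflake.snowpark import Session

connection_parameters = {
    "account": "<account_locator>",  # placeholder
    "user": "<user>",                # placeholder
    "password": "<password>",        # placeholder
    "role": "<role>",                # placeholder
    "warehouse": "<warehouse>",      # placeholder
}
session = Session.builder.configs(connection_parameters).create()

result = session.sql(
    """
    CALL CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.GENERATE_SEMANTIC_FILE(
        'CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE',  -- STAGE_NAME, no leading @
        'my_semantic_model',                                                  -- MODEL_NAME
        3,                                                                    -- SAMPLE_VALUE
        FALSE,                                                                -- ALLOW_JOINS
        ARRAY_CONSTRUCT('MY_DB.MY_SCHEMA.MY_TABLE')                           -- TABLE_LIST (placeholder table)
    )
    """
).collect()
print(result[0][0])  # e.g. "Semantic model file my_semantic_model.yaml has been generated and saved to ..."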