├── .env.example
├── .github
    ├── CODEOWNERS
    └── workflows
    │   ├── lint.yaml
    │   ├── release.yaml
    │   └── test.yaml
├── .gitignore
├── CHANGELOG.md
├── LEGAL.md
├── LICENSE
├── Makefile
├── README.md
├── app.py
├── app_utils
    ├── __init__.py
    ├── chat.py
    ├── looker_sdk.zip
    ├── shared_utils.py
    └── strictyaml.zip
├── artifacts
    ├── SMG_DEMO.csv
    └── customers.yml
├── environment.yml
├── images
    ├── dbt-signature_tm_black.png
    ├── error39.png
    └── looker.png
├── journeys
    ├── __init__.py
    ├── builder.py
    ├── evaluation.py
    ├── iteration.py
    ├── joins.py
    └── partner.py
├── mypy.ini
├── partner
    ├── __init__.py
    ├── cortex.py
    ├── dbt.py
    ├── looker.py
    └── partner_utils.py
├── poetry.lock
├── pyproject.toml
├── semantic_model_generator
    ├── __init__.py
    ├── data_processing
    │   ├── __init__.py
    │   ├── cte_utils.py
    │   ├── cte_utils_test.py
    │   ├── data_types.py
    │   └── proto_utils.py
    ├── generate_model.py
    ├── output_models
    │   └── .keep
    ├── protos
    │   ├── semantic_model.proto
    │   ├── semantic_model_pb2.py
    │   └── semantic_model_pb2.pyi
    ├── snowflake_utils
    │   ├── env_vars.py
    │   ├── snowflake_connector.py
    │   └── utils.py
    ├── tests
    │   ├── cte_utils_test.py
    │   ├── generate_model_test.py
    │   ├── samples
    │   │   └── validate_yamls.py
    │   ├── snowflake_connector_test.py
    │   ├── utils_test.py
    │   ├── validate_model_test.py
    │   └── yaml_to_semantic_model_test.py
    ├── validate
    │   ├── context_length.py
    │   ├── keywords.py
    │   └── schema.py
    └── validate_model.py
└── sis_setup
    ├── app_setup.sql
    ├── looker_integration.sql
    └── sissetup_snowsightgit.sql


/.env.example:
--------------------------------------------------------------------------------
 1 | # Example config for username/password auth
 2 | SNOWFLAKE_ROLE="<my_role>"
 3 | SNOWFLAKE_WAREHOUSE="<my_warehouse>"
 4 | SNOWFLAKE_USER="<my_user>"
 5 | SNOWFLAKE_PASSWORD="<my_pw>"
 6 | SNOWFLAKE_ACCOUNT_LOCATOR="<my_snowflake_account>"
 7 | SNOWFLAKE_HOST="<my_snowflake_host>"
 8 | 
 9 | 
10 | # Example config for externalbrowser auth
11 | SNOWFLAKE_ROLE="<my_role>"
12 | SNOWFLAKE_WAREHOUSE="<my_warehouse>"
13 | SNOWFLAKE_USER="<my_user>"
14 | SNOWFLAKE_PASSWORD="<my_pw>"
15 | SNOWFLAKE_ACCOUNT_LOCATOR="<my_snowflake_account>"
16 | SNOWFLAKE_HOST="<my_snowflake_host>"
17 | SNOWFLAKE_AUTHENTICATOR="externalbrowser"
18 | 
19 | 
20 | # Example config for username/password auth using MFA
21 | SNOWFLAKE_ROLE="<my_role>"
22 | SNOWFLAKE_WAREHOUSE="<my_warehouse>"
23 | SNOWFLAKE_USER="<my_user>"
24 | SNOWFLAKE_PASSWORD="<my_pw>"
25 | SNOWFLAKE_ACCOUNT_LOCATOR="<my_snowflake_account>"
26 | SNOWFLAKE_HOST="<my_snowflake_host>"
27 | SNOWFLAKE_AUTHENTICATOR="username_password_mfa"
28 | SNOWFLAKE_MFA_PASSCODE="<my_mfa_passcode>"
29 | 


--------------------------------------------------------------------------------
/.github/CODEOWNERS:
--------------------------------------------------------------------------------
1 | * @sfc-gh-cnivera @sfc-gh-jsummer
2 | /semantic_model_generator/ @sfc-gh-rehuang @sfc-gh-cnivera @sfc-gh-jsummer
3 | 


--------------------------------------------------------------------------------
/.github/workflows/lint.yaml:
--------------------------------------------------------------------------------
 1 | name: Semantic Model Format & Lint
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     branches:
 6 |       - "*"
 7 | 
 8 | jobs:
 9 |   build:
10 |     runs-on: ubuntu-latest
11 |     strategy:
12 |       matrix:
13 |         python-version: [ "3.10" ]
14 | 
15 |     steps:
16 |       - name: Check out the code
17 |         uses: actions/checkout@v4
18 | 
19 |       - name: Set up Python ${{ matrix.python-version }}
20 |         uses: actions/setup-python@v5
21 |         with:
22 |           python-version: ${{ matrix.python-version }}
23 | 
24 |       - name: Cache Poetry virtualenv
25 |         uses: actions/cache@v4
26 |         with:
27 |           path: ~/.cache/pypoetry/virtualenvs
28 |           key: ${{ runner.os }}-poetry-${{ hashFiles('poetry.lock') }}
29 |           restore-keys: |
30 |             ${{ runner.os }}-poetry-
31 | 
32 |       - name: Install Poetry
33 |         run: |
34 |           python3 -m pip install --user pipx
35 |           python3 -m pipx ensurepath
36 |           pipx install poetry
37 | 
38 |       - name: Configure Poetry
39 |         run: |
40 |           export PATH="$HOME/.local/bin:$PATH"
41 |           poetry config virtualenvs.create false
42 | 
43 |       - name: Install dependencies using Poetry
44 |         run: |
45 |           poetry install --no-interaction
46 | 
47 |       - name: Run mypy
48 |         id: mypy
49 |         run: |
50 |           make run_mypy
51 |         continue-on-error: true
52 | 
53 |       - name: Check with black
54 |         id: black
55 |         run: |
56 |           make check_black
57 |         continue-on-error: true
58 | 
59 |       - name: Check with isort
60 |         id: isort
61 |         run: |
62 |           make check_isort
63 |         continue-on-error: true
64 | 
65 |       - name: Run flake8
66 |         id: flake8
67 |         run: |
68 |           make run_flake8
69 |         continue-on-error: true
70 | 
71 |       - name: Report failures
72 |         run: |
73 |           if [ "${{ steps.black.outcome }}" != "success" ]; then echo "black failed"; FAIL=1; fi
74 |           if [ "${{ steps.isort.outcome }}" != "success" ]; then echo "isort failed"; FAIL=1; fi
75 |           if [ "${{ steps.flake8.outcome }}" != "success" ]; then echo "flake8 failed"; FAIL=1; fi
76 |           if [ "$FAIL" == "1" ]; then exit 1; fi
77 |         continue-on-error: false
78 | 


--------------------------------------------------------------------------------
/.github/workflows/release.yaml:
--------------------------------------------------------------------------------
 1 | name: Build and Attach Wheel to GitHub Release
 2 | 
 3 | on:
 4 |   push:
 5 |     tags:
 6 |       - 'release/v*'
 7 | 
 8 | jobs:
 9 |   build:
10 |     runs-on: ubuntu-latest
11 |     steps:
12 |       - uses: actions/checkout@v4
13 |       - name: Set up Python
14 |         uses: actions/setup-python@v4
15 |         with:
16 |           python-version: '3.10'
17 |       - name: Install Poetry
18 |         run: pip install poetry
19 |       - name: Get the version from pyproject.toml
20 |         run: echo "VERSION=$(poetry version -s)" >> $GITHUB_ENV
21 |       - name: Build Wheel
22 |         run: poetry build -f wheel
23 |       - name: Extract Changelog for the Version
24 |         run: |
25 |           VERSION=${{ env.VERSION }}
26 |           CHANGELOG=$(awk '/^## \['"${VERSION//./\\.}"'\]/ {flag=1; next} /^## \[/ {flag=0} flag' CHANGELOG.md)
27 |           echo "CHANGELOG<<EOF" >> $GITHUB_ENV
28 |           echo "$CHANGELOG" >> $GITHUB_ENV
29 |           echo "EOF" >> $GITHUB_ENV
30 |       - name: Upload Wheel to Release
31 |         uses: softprops/action-gh-release@v1
32 |         with:
33 |           files: dist/*.whl
34 |           body: ${{ env.CHANGELOG }}
35 |         env:
36 |           GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
37 | 


--------------------------------------------------------------------------------
/.github/workflows/test.yaml:
--------------------------------------------------------------------------------
 1 | name: Semantic Model Generator Test
 2 | 
 3 | on:
 4 |   pull_request:
 5 |     paths:
 6 |       - "semantic_model_generator/**"
 7 |       - "pyproject.toml"
 8 | 
 9 | jobs:
10 |   build:
11 |     runs-on: ubuntu-latest
12 |     strategy:
13 |       matrix:
14 |         python-version: [ "3.10" ]
15 |     steps:
16 |       - name: Check out the code
17 |         uses: actions/checkout@v4
18 | 
19 |       - name: Set up Python ${{ matrix.python-version }}
20 |         uses: actions/setup-python@v5
21 |         with:
22 |           python-version: ${{ matrix.python-version }}
23 | 
24 |       # Caching dependencies using Poetry
25 |       - name: Cache Poetry virtualenv
26 |         uses: actions/cache@v4
27 |         with:
28 |           path: ~/.cache/pypoetry/virtualenvs
29 |           key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
30 |           restore-keys: |
31 |             ${{ runner.os }}-poetry-
32 | 
33 |       - name: Install Poetry
34 |         run: |
35 |           curl -sSL https://install.python-poetry.org | python3 -
36 | 
37 |       - name: Configure Poetry
38 |         run: |
39 |           $HOME/.local/bin/poetry config virtualenvs.create false
40 | 
41 |       - name: Install dependencies using Poetry
42 |         run: |
43 |           $HOME/.local/bin/poetry install --no-interaction
44 | 
45 |       - name: Test
46 |         run: |
47 |           make test_github_workflow
48 | 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
 1 | # Local python environment
 2 | pyvenv
 3 | *.DS_Store
 4 | 
 5 | # Notebook intermediate state
 6 | *.ipynb_checkpoints
 7 | 
 8 | # Mypy
 9 | .mypy_cache
10 | 
11 | # Pytest
12 | .pytest_cache
13 | 
14 | # pycache
15 | **/__pycache__
16 | 
17 | # Python package builds
18 | *.egg-info
19 | 
20 | # VSCode
21 | .vscode/settings.json
22 | .vscode/launch.json
23 | .vscode/.ropeproject
24 | .vscode/*.log
25 | .vscode/*.json
26 | 
27 | # Jetbrains
28 | .idea/*
29 | 
30 | # Envs
31 | .env
32 | .venv
33 | .direnv
34 | .envrc
35 | 
36 | # Output semantic models
37 | semantic_model_generator/output_models/*.yaml
38 | 
39 | # test coverage
40 | .coverage


--------------------------------------------------------------------------------
/CHANGELOG.md:
--------------------------------------------------------------------------------
  1 | # Changelog
  2 | 
  3 | You must follow the format of `## [VERSION-NUMBER]` for the GitHub workflow to pick up the text.
  4 | 
  5 | ## [0.1.33] - 2024-08-07
  6 | 
  7 | ### Updates
  8 | 
  9 | - Throw an error during validation if a user adds duplicate verified queries to their semantic model.
 10 | 
 11 | ## [0.1.32] - 2024-07-30
 12 | 
 13 | ### Updates
 14 | 
 15 | - Bump context length validation limit.
 16 | - Fix union type hints for support with Python <3.10.
 17 | 
 18 | ## [0.1.31] - 2024-07-29
 19 | 
 20 | ### Updates
 21 | 
 22 | - Include new `secure-local-storage` extra package for `snowflake-python-connector` dependency.
 23 | 
 24 | ## [0.1.30] - 2024-07-12
 25 | 
 26 | ### Updates
 27 | 
 28 | - Restrict Python version to < 3.12 in order to avoid issues with pyarrow dependency.
 29 | 
 30 | ## [0.1.29] - 2024-07-10
 31 | 
 32 | ### Updates
 33 | 
 34 | - Allow single sign on auth.
 35 | 
 36 | ## [0.1.28] - 2024-07-09
 37 | 
 38 | ### Updates
 39 | 
 40 | - Allow auto-generation of descriptions for semantic models.
 41 | 
 42 | ## [0.1.27] - 2024-07-03
 43 | 
 44 | ### Updates
 45 | 
 46 | - Fix VQR validation for measures with aggregation calculation.
 47 | - Update pulling sample value by dimension vs. measures; fix length validation logic.
 48 | 
 49 | ## [0.1.26] - 2024-07-02
 50 | 
 51 | ### Updates
 52 | 
 53 | - Semantic model size validation allows for many more sample values.
 54 |   This corresponds with a release of the Cortex Analyst that does dynamic sample value retrieval by default.
 55 | 
 56 | ## [0.1.25] - 2024-06-18
 57 | 
 58 | ### Updates
 59 | 
 60 | - Plumb through column and table comments
 61 | - Skip host name match verification for now
 62 | 
 63 | ## [0.1.24] - 2024-06-17
 64 | 
 65 | ### Updates
 66 | 
 67 | - Consolidate validations to use the same set of utils
 68 | - Handle the validation for expr with aggregations properly
 69 | 
 70 | ## [0.1.23] - 2024-06-13
 71 | 
 72 | ### Updates
 73 | 
 74 | - Remove VQR from context length calculation.
 75 | - Add toggle for number of sample values.
 76 | 
 77 | ## [0.1.22] - 2024-06-11
 78 | 
 79 | ### Updates
 80 | 
 81 | - Fix small streamlit app components to be compatible with python 3.8
 82 | 
 83 | ## [0.1.21] - 2024-06-10
 84 | 
 85 | ### Updates
 86 | 
 87 | - Add validation for verified queries;
 88 | - Add streamlit admin app for semantic model generation, validation and verified query flow.
 89 | 
 90 | ## [0.1.20] - 2024-05-31
 91 | 
 92 | ### Updates
 93 | 
 94 | - Fix for validation CLI and README
 95 | 
 96 | ## [0.1.19] - 2024-05-31
 97 | 
 98 | ### Updates
 99 | 
100 | - Fix protobuf version to be compatible with streamlit
101 | - Small refactor in validation file
102 | 
103 | ## [0.1.18] - 2024-05-31
104 | 
105 | ### Updates
106 | 
107 | - Add proto definition for verified queries; also add proto for Column (for backward compatibility only)
108 | 
109 | ## [0.1.17] - 2024-05-21
110 | 
111 | ### Updates
112 | 
113 | - Allow flow style in yaml validation
114 | 
115 | ## [0.1.16] - 2024-05-15
116 | 
117 | ### Updates
118 | 
119 | - Remove validation of context length to after save.
120 | - Uppercase db/schema/table(s)
121 | 
122 | ## [0.1.15] - 2024-05-14
123 | 
124 | ### Updates
125 | 
126 | - Use strictyaml to validate the semantic model yaml matches the expected schema and has all required fields
127 | 
128 | ## [0.1.14] - 2024-05-13
129 | 
130 | ### Updates
131 | 
132 | - Fix aggregations
133 | - Context limit
134 | 
135 | ## [0.1.13] - 2024-05-08
136 | 
137 | ### Updates
138 | 
139 | - Object types not supported in generation or validation.
140 | 
141 | ## [0.1.12] - 2024-05-03
142 | 
143 | ### Updates
144 | 
145 | - Naming
146 | - Validate no expressions in cols in yaml
147 | 
148 | ## [0.1.11] - 2024-05-01
149 | 
150 | ### Updates
151 | 
152 | - Save path location
153 | 
154 | ## [0.1.10] - 2024-05-01
155 | 
156 | ### Updates
157 | 
158 | - Save path location
159 | 
160 | ## [0.1.9] - 2024-04-29
161 | 
162 | ### Updates
163 | 
164 | - Add additional validation for mismatched quotes. Test incorrect enums.
165 | 
166 | ## [0.1.8] - 2024-04-23
167 | 
168 | ### Updates
169 | 
170 | - run select against given cols in semantic model for validation
171 | 
172 | ## [0.1.7] - 2024-04-18
173 | 
174 | ### Updates
175 | 
176 | - Parse yaml model into protos, validate cols and col naming
177 | 
178 | ## [0.1.6] - 2024-04-16
179 | 
180 | ### Updates
181 | 
182 | - First yaml validation included.
183 | 
184 | ## [0.1.5] - 2024-04-15d
185 | 
186 | ### Updates
187 | 
188 | - Downgrade pyarrow
189 | 
190 | ## [0.1.4] - 2024-04-15c
191 | 
192 | ### Updates
193 | 
194 | - Spacing typo
195 | 
196 | ## [0.1.3] - 2024-04-15b
197 | 
198 | ### Updates
199 | 
200 | - Fix 3.8 typing
201 | - Some function renaming
202 | - Support all Snowflake datatypes
203 | 
204 | ## [0.1.2] - 2024-04-15
205 | 
206 | ### Updates
207 | 
208 | - Downgrade to python 3.8 and resolve typing issues with optional.
209 | - Fix FQN parts for pydantic errors.
210 | - Update README to be less restrictive for installs.
211 | 
212 | ## [0.1.1] - 2024-04-09
213 | 
214 | ### Released
215 | 
216 | - Verify release workflow works as intended
217 | 
218 | ## [0.1.0] - 2024-04-08
219 | 
220 | ### Released
221 | 
222 | - Initial release of the project.
223 | 


--------------------------------------------------------------------------------
/LEGAL.md:
--------------------------------------------------------------------------------
1 | This application is not part of the Snowflake Service and is governed by the terms in LICENSE, unless expressly agreed to in writing. You use this application at your own risk, and Snowflake has no obligation to support your use of this application.
2 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------
/Makefile:
--------------------------------------------------------------------------------
 1 | .PHONY: run_admin_app
 2 | 
 3 | install-poetry:
 4 | 	curl -sSL https://install.python-poetry.org | python3 -
 5 | 
 6 | install-homebrew:
 7 | 	/bin/bash -c "$$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)"
 8 | 
 9 | install-pyenv:
10 | 	@command -v brew >/dev/null 2>&1 || $(MAKE) install-homebrew
11 | 	brew install pyenv
12 | 
13 | install-python-3.8:
14 | 	@echo "Python 3.8 not found. Installing Python 3.8 using pyenv."
15 | 	@pyenv install 3.8
16 | 	@pyenv local 3.8
17 | 
18 | check-deps:
19 | 	@command -v poetry >/dev/null 2>&1 || $(MAKE) install-poetry
20 | 
21 | 
22 | shell: check-deps ## Get into a poetry shell
23 | 	poetry shell
24 | 
25 | setup: check-deps shell ## Install dependencies into your poetry environment.
26 | 	poetry install
27 | 
28 | # app
29 | run_admin_app:
30 | 	python -m streamlit run app.py
31 | 
32 | # Installs dependencies for the admin app.
33 | setup_admin_app:
34 | 	pip install .
35 | 
36 | # Linting and formatting below.
37 | run_mypy:  ## Run mypy
38 | 	mypy --config-file=mypy.ini .
39 | 
40 | run_flake8:  ## Run flake8
41 | 	flake8 --ignore=E203,E501,W503 --exclude=venv,.venv,pyvenv,tmp,*_pb2.py,*_pb2.pyi,images/*/src .
42 | 
43 | check_black:  ## Check to see if files would be updated with black.
44 |     # Exclude pyvenv and all generated protobuf code.
45 | 	black --check --exclude=".venv|venv|pyvenv|.*_pb2.py|.*_pb2.pyi" .
46 | 
47 | run_black:  ## Run black to format files.
48 |     # Exclude pyvenv, tmp, and all generated protobuf code.
49 | 	black --exclude=".venv|venv|pyvenv|tmp|.*_pb2.py|.*_pb2.pyi" .
50 | 
51 | check_isort:  ## Check if files would be updated with isort.
52 | 	isort --profile black --check --skip=venv --skip=pyvenv --skip=.venv --skip-glob='*_pb2.py*' .
53 | 
54 | run_isort:  ## Run isort to update imports.
55 | 	isort --profile black --skip=pyvenv --skip=venv --skip=tmp --skip=.venv --skip-glob='*_pb2.py*' .
56 | 
57 | 
58 | fmt_lint: shell ## lint/fmt in current python environment
59 | 	make run_black run_isort run_flake8
60 | 
61 | # Test below
62 | test: shell ## Run tests.
63 | 	python -m pytest -vvs semantic_model_generator
64 | 
65 | test_github_workflow:  ## For use on github workflow.
66 | 	python -m pytest -vvs semantic_model_generator
67 | 
68 | # Release
69 | update-version: ## Bump poetry and github version. TYPE should be `patch` `minor` or `major`
70 | 	@echo "Updating Poetry version ($(TYPE)) and creating a Git tag..."
71 | 	@poetry version $(TYPE)
72 | 	@echo "Version updated to $$VERSION. Update the CHANGELOG.md `make release`"
73 | 
74 | release: ## Runs the release workflow.
75 | 	@VERSION=$$(poetry version -s) && git commit --allow-empty  -m "Bump version to $$VERSION" && git tag release/v$$VERSION && \
76 |  	git push origin HEAD && git push origin HEAD --tags
77 | 
78 | build: ## Clean the dist dir and build the whl file
79 | 	rm -rf dist
80 | 	mkdir dist
81 | 	poetry build
82 | 
83 | help: ## Show this help.
84 | 	@fgrep -h "##" $(MAKEFILE_LIST) | fgrep -v fgrep | sed -e 's/\\$$//' | sed -e 's


--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | from snowflake.connector import DatabaseError
  3 | from snowflake.connector.connection import SnowflakeConnection
  4 | 
  5 | # set_page_config must be run as the first Streamlit command on the page, before any other streamlit imports.
  6 | st.set_page_config(layout="wide", page_icon="💬", page_title="Semantic Model Generator")
  7 | 
  8 | from app_utils.shared_utils import (  # noqa: E402
  9 |     GeneratorAppScreen,
 10 |     get_snowflake_connection,
 11 |     set_account_name,
 12 |     set_host_name,
 13 |     set_sit_query_tag,
 14 |     set_snowpark_session,
 15 |     set_streamlit_location,
 16 |     set_user_name,
 17 | )
 18 | from semantic_model_generator.snowflake_utils.env_vars import (  # noqa: E402
 19 |     SNOWFLAKE_ACCOUNT_LOCATOR,
 20 |     SNOWFLAKE_HOST,
 21 |     SNOWFLAKE_USER,
 22 | )
 23 | 
 24 | 
 25 | @st.experimental_dialog(title="Connection Error")
 26 | def failed_connection_popup() -> None:
 27 |     """
 28 |     Renders a dialog box detailing that the credentials provided could not be used to connect to Snowflake.
 29 |     """
 30 |     st.markdown(
 31 |         """It looks like the credentials provided could not be used to connect to the account."""
 32 |     )
 33 |     st.stop()
 34 | 
 35 | 
 36 | def verify_environment_setup() -> SnowflakeConnection:
 37 |     """
 38 |     Ensures that the correct environment variables are set before proceeding.
 39 |     """
 40 | 
 41 |     # Instantiate the Snowflake connection that gets reused throughout the app.
 42 |     try:
 43 |         with st.spinner(
 44 |             "Validating your connection to Snowflake. If you are using MFA, please check your authenticator app for a push notification."
 45 |         ):
 46 |             return get_snowflake_connection()
 47 |     except DatabaseError:
 48 |         failed_connection_popup()
 49 | 
 50 | 
 51 | if __name__ == "__main__":
 52 |     from journeys import builder, iteration, partner
 53 | 
 54 |     st.session_state["sis"] = set_streamlit_location()
 55 | 
 56 |     def onboarding_dialog() -> None:
 57 |         """
 58 |         Renders the initial screen where users can choose to create a new semantic model or edit an existing one.
 59 |         """
 60 | 
 61 |         # Direct to specific page based instead of default onboarding if user comes from successful partner setup
 62 |         st.markdown(
 63 |             """
 64 |                 <div style="text-align: center;">
 65 |                     <h1>Welcome to the Snowflake Semantic Model Generator! ❄️</h1>
 66 |                     <p>⚠️  Heads up! The Streamlit app is no longer supported for semantic model creation.
 67 |                     <p>👉 Please use the Snowsight UI in Snowflake to create and update semantic models — it’s newer and works better! </p>
 68 |                     <p>✅ Once your model is created in Snowsight, come back here to run evaluations, which still work best in this app.</p>
 69 |                 </div>
 70 |             """,
 71 |             unsafe_allow_html=True,
 72 |         )
 73 | 
 74 |         st.markdown("<div style='margin: 60px;'></div>", unsafe_allow_html=True)
 75 | 
 76 |         _, center, _ = st.columns([1, 2, 1])
 77 |         with center:
 78 |             if st.button(
 79 |                 "**[⚠️ Deprecated]🛠 Create a new semantic model**",
 80 |                 use_container_width=True,
 81 |                 type="primary",
 82 |             ):
 83 |                 builder.show()
 84 |             st.markdown("")
 85 |             if st.button(
 86 |                 "**✏️ Edit an existing semantic model**",
 87 |                 use_container_width=True,
 88 |                 type="primary",
 89 |             ):
 90 |                 iteration.show()
 91 |             st.markdown("")
 92 |             if st.button(
 93 |                 "**[⚠️ Deprecated]📦 Start with partner semantic model**",
 94 |                 use_container_width=True,
 95 |                 type="primary",
 96 |             ):
 97 |                 set_sit_query_tag(
 98 |                     get_snowflake_connection(),
 99 |                     vendor="",
100 |                     action="start",
101 |                 )
102 |                 partner.show()
103 | 
104 |     conn = verify_environment_setup()
105 |     set_snowpark_session(conn)
106 | 
107 |     # Populating common state between builder and iteration apps.
108 |     set_account_name(conn, SNOWFLAKE_ACCOUNT_LOCATOR)
109 |     set_host_name(conn, SNOWFLAKE_HOST)
110 |     set_user_name(conn, SNOWFLAKE_USER)
111 | 
112 |     # When the app first loads, show the onboarding screen.
113 |     if "page" not in st.session_state:
114 |         st.session_state["page"] = GeneratorAppScreen.ONBOARDING
115 | 
116 |     # Depending on the page state, we either show the onboarding menu or the chat app flow.
117 |     # The builder flow is simply an intermediate dialog before the iteration flow.
118 |     if st.session_state["page"] == GeneratorAppScreen.ITERATION:
119 |         iteration.show()
120 |     else:
121 |         onboarding_dialog()
122 | 


--------------------------------------------------------------------------------
/app_utils/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/app_utils/__init__.py


--------------------------------------------------------------------------------
/app_utils/chat.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import re
 3 | from typing import Any, Dict
 4 | 
 5 | import requests
 6 | import streamlit as st
 7 | from snowflake.connector import SnowflakeConnection
 8 | 
 9 | API_ENDPOINT = "https://{HOST}/api/v2/cortex/analyst/message"
10 | 
11 | 
12 | @st.cache_data(ttl=60, show_spinner=False)
13 | def send_message(
14 |     _conn: SnowflakeConnection, semantic_model: str, messages: list[dict[str, str]]
15 | ) -> Dict[str, Any]:
16 |     """
17 |     Calls the REST API with a list of messages and returns the response.
18 |     Args:
19 |         _conn: SnowflakeConnection, used to grab the token for auth.
20 |         messages: list of chat messages to pass to the Analyst API.
21 |         semantic_model: stringified YAML of the semantic model.
22 | 
23 |     Returns: The raw ChatMessage response from Analyst.
24 |     """
25 |     request_body = {
26 |         "messages": messages,
27 |         "semantic_model": semantic_model,
28 |     }
29 | 
30 |     if st.session_state["sis"]:
31 |         import _snowflake
32 | 
33 |         resp = _snowflake.send_snow_api_request(  # type: ignore
34 |             "POST",
35 |             "/api/v2/cortex/analyst/message",
36 |             {},
37 |             {},
38 |             request_body,
39 |             {},
40 |             30000,
41 |         )
42 |         if resp["status"] < 400:
43 |             json_resp: Dict[str, Any] = json.loads(resp["content"])
44 |             return json_resp
45 |         else:
46 |             err_body = json.loads(resp["content"])
47 |             if "message" in err_body:
48 |                 # Certain errors have a message payload with a link to the github repo, which we should remove.
49 |                 error_msg = re.sub(
50 |                     r"\s*Please use https://github\.com/Snowflake-Labs/semantic-model-generator.*",
51 |                     "",
52 |                     err_body["message"],
53 |                 )
54 |                 raise ValueError(error_msg)
55 |             raise ValueError(err_body)
56 | 
57 |     else:
58 |         host = st.session_state.host_name
59 |         resp = requests.post(
60 |             API_ENDPOINT.format(
61 |                 HOST=host,
62 |             ),
63 |             json=request_body,
64 |             headers={
65 |                 "Authorization": f'Snowflake Token="{_conn.rest.token}"',  # type: ignore[union-attr]
66 |                 "Content-Type": "application/json",
67 |             },
68 |         )
69 |         if resp.status_code < 400:
70 |             json_resp: Dict[str, Any] = resp.json()
71 |             return json_resp
72 |         else:
73 |             err_body = json.loads(resp.text)
74 |             if "message" in err_body:
75 |                 # Certain errors have a message payload with a link to the github repo, which we should remove.
76 |                 error_msg = re.sub(
77 |                     r"\s*Please use https://github\.com/Snowflake-Labs/semantic-model-generator.*",
78 |                     "",
79 |                     err_body["message"],
80 |                 )
81 |                 raise ValueError(error_msg)
82 |             raise ValueError(err_body)
83 | 


--------------------------------------------------------------------------------
/app_utils/looker_sdk.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/app_utils/looker_sdk.zip


--------------------------------------------------------------------------------
/app_utils/strictyaml.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/app_utils/strictyaml.zip


--------------------------------------------------------------------------------
/artifacts/customers.yml:
--------------------------------------------------------------------------------
  1 | models:
  2 |   - name: customers
  3 |     description: Customer overview data mart, offering key details for each unique customer. One row per customer.
  4 |     data_tests:
  5 |       - dbt_utils.expression_is_true:
  6 |           expression: "lifetime_spend_pretax + lifetime_tax_paid = lifetime_spend"
  7 |     columns:
  8 |       - name: customer_id
  9 |         description: The unique key of the orders mart.
 10 |         data_tests:
 11 |           - not_null
 12 |           - unique
 13 |       - name: customer_name
 14 |         description: Customers' full name.
 15 |       - name: count_lifetime_orders
 16 |         description: Total number of orders a customer has ever placed.
 17 |       - name: first_ordered_at
 18 |         description: The timestamp when a customer placed their first order.
 19 |       - name: last_ordered_at
 20 |         description: The timestamp of a customer's most recent order.
 21 |       - name: lifetime_spend_pretax
 22 |         description: The sum of all the pre-tax subtotals of every order a customer has placed.
 23 |       - name: lifetime_tax_paid
 24 |         description: The sum of all the tax portion of every order a customer has placed.
 25 |       - name: lifetime_spend
 26 |         description: The sum of all the order totals (including tax) that a customer has ever placed.
 27 |       - name: customer_type
 28 |         description: Options are 'new' or 'returning', indicating if a customer has ordered more than once or has only placed their first order to date.
 29 |         data_tests:
 30 |           - accepted_values:
 31 |               values: ["new", "returning"]
 32 | 
 33 | semantic_models:
 34 |   - name: customers
 35 |     defaults:
 36 |       agg_time_dimension: first_ordered_at
 37 |     description: |
 38 |       Customer grain mart.
 39 |     model: ref('customers')
 40 |     entities:
 41 |       - name: customer
 42 |         expr: customer_id
 43 |         type: primary
 44 |     dimensions:
 45 |       - name: customer_name
 46 |         type: categorical
 47 |       - name: customer_type
 48 |         type: categorical
 49 |       - name: first_ordered_at
 50 |         type: time
 51 |         type_params:
 52 |           time_granularity: day
 53 |       - name: last_ordered_at
 54 |         type: time
 55 |         type_params:
 56 |           time_granularity: day
 57 |     measures:
 58 |       - name: count_lifetime_orders
 59 |         description: Total count of orders per customer.
 60 |         agg: sum
 61 |       - name: lifetime_spend_pretax
 62 |         description: Customer lifetime spend before taxes.
 63 |         agg: sum
 64 |       - name: lifetime_spend
 65 |         agg: sum
 66 |         description: Gross customer lifetime spend inclusive of taxes.
 67 | 
 68 | metrics:
 69 |   - name: lifetime_spend_pretax
 70 |     description: Customer's lifetime spend before tax
 71 |     label: LTV Pre-tax
 72 |     type: simple
 73 |     type_params:
 74 |       measure: lifetime_spend_pretax
 75 |   - name: count_lifetime_orders
 76 |     description: Count of lifetime orders
 77 |     label: Count Lifetime Orders
 78 |     type: simple
 79 |     type_params:
 80 |       measure: count_lifetime_orders
 81 |   - name: average_order_value
 82 |     description: LTV pre-tax / number of orders
 83 |     label: Average Order Value
 84 |     type: derived
 85 |     type_params:
 86 |       metrics:
 87 |         - count_lifetime_orders
 88 |         - lifetime_spend_pretax
 89 |       expr: lifetime_spend_pretax / count_lifetime_orders
 90 | 
 91 | saved_queries:
 92 |   - name: customer_order_metrics
 93 |     query_params:
 94 |       metrics:
 95 |         - count_lifetime_orders
 96 |         - lifetime_spend_pretax
 97 |         - average_order_value
 98 |       group_by:
 99 |         - Entity('customer')
100 |     exports:
101 |       - name: customer_order_metrics
102 |         config:
103 |           export_as: table
104 | 


--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
 1 | name: sf_env
 2 | channels:
 3 | - snowflake
 4 | dependencies:
 5 | - python=3.10.*
 6 | - pandas=2.2.2
 7 | - tqdm=4.66.5
 8 | - streamlit=1.35.0
 9 | - loguru=0.5.3
10 | - protobuf=3.20.3
11 | - pydantic=2.8.2
12 | - pyyaml=6.0.1
13 | - ruamel.yaml=0.17.21
14 | - pyarrow=14.0.2
15 | - sqlglot=25.10.0
16 | - numpy=1.26.4
17 | - python-dotenv=0.21.0
18 | - urllib3=2.2.2
19 | - requests=2.32.3
20 | - types-pyyaml=6.0.12.12
21 | - types-protobuf=4.25.0.20240417
22 | - snowflake-snowpark-python=1.18.0
23 | - streamlit-extras=0.4.0
24 | - cattrs=23.1.2


--------------------------------------------------------------------------------
/images/dbt-signature_tm_black.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/images/dbt-signature_tm_black.png


--------------------------------------------------------------------------------
/images/error39.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/images/error39.png


--------------------------------------------------------------------------------
/images/looker.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/images/looker.png


--------------------------------------------------------------------------------
/journeys/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/journeys/__init__.py


--------------------------------------------------------------------------------
/journeys/builder.py:
--------------------------------------------------------------------------------
  1 | import streamlit as st
  2 | from loguru import logger
  3 | from snowflake.connector import ProgrammingError
  4 | 
  5 | from app_utils.shared_utils import (
  6 |     GeneratorAppScreen,
  7 |     format_snowflake_context,
  8 |     get_available_databases,
  9 |     get_available_schemas,
 10 |     get_available_tables,
 11 |     input_sample_value_num,
 12 |     input_semantic_file_name,
 13 |     run_generate_model_str_from_snowflake,
 14 | )
 15 | 
 16 | 
 17 | def update_schemas_and_tables() -> None:
 18 |     """
 19 |     Callback to run when the selected databases change. Ensures that if a database is deselected, the corresponding
 20 |     schemas and tables are also deselected.
 21 |     Returns: None
 22 | 
 23 |     """
 24 |     databases = st.session_state["selected_databases"]
 25 | 
 26 |     # Fetch the available schemas for the selected databases
 27 |     schemas = []
 28 |     for db in databases:
 29 |         try:
 30 |             schemas.extend(get_available_schemas(db))
 31 |         except ProgrammingError:
 32 |             logger.info(
 33 |                 f"Insufficient permissions to read from database {db}, skipping"
 34 |             )
 35 | 
 36 |     st.session_state["available_schemas"] = schemas
 37 | 
 38 |     # Enforce that the previously selected schemas are still valid
 39 |     valid_selected_schemas = [
 40 |         schema for schema in st.session_state["selected_schemas"] if schema in schemas
 41 |     ]
 42 |     st.session_state["selected_schemas"] = valid_selected_schemas
 43 |     update_tables()
 44 | 
 45 | 
 46 | def update_tables() -> None:
 47 |     """
 48 |     Callback to run when the selected schemas change. Ensures that if a schema is deselected, the corresponding
 49 |     tables are also deselected.
 50 |     """
 51 |     schemas = st.session_state["selected_schemas"]
 52 | 
 53 |     # Fetch the available tables for the selected schemas
 54 |     tables = []
 55 |     for schema in schemas:
 56 |         try:
 57 |             tables.extend(get_available_tables(schema))
 58 |         except ProgrammingError:
 59 |             logger.info(
 60 |                 f"Insufficient permissions to read from schema {schema}, skipping"
 61 |             )
 62 |     st.session_state["available_tables"] = tables
 63 | 
 64 |     # Enforce that the previously selected tables are still valid
 65 |     valid_selected_tables = [
 66 |         table for table in st.session_state["selected_tables"] if table in tables
 67 |     ]
 68 |     st.session_state["selected_tables"] = valid_selected_tables
 69 | 
 70 | 
 71 | @st.experimental_dialog("Selecting your tables", width="large")
 72 | def table_selector_dialog() -> None:
 73 |     st.write(
 74 |         "Please fill out the following fields to start building your semantic model."
 75 |     )
 76 |     model_name = input_semantic_file_name()
 77 |     sample_values = input_sample_value_num()
 78 |     st.markdown("")
 79 | 
 80 |     if "selected_databases" not in st.session_state:
 81 |         st.session_state["selected_databases"] = []
 82 | 
 83 |     if "selected_schemas" not in st.session_state:
 84 |         st.session_state["selected_schemas"] = []
 85 | 
 86 |     if "selected_tables" not in st.session_state:
 87 |         st.session_state["selected_tables"] = []
 88 | 
 89 |     with st.spinner("Loading databases..."):
 90 |         available_databases = get_available_databases()
 91 | 
 92 |     st.multiselect(
 93 |         label="Databases",
 94 |         options=available_databases,
 95 |         placeholder="Select the databases that contain the tables you'd like to include in your semantic model.",
 96 |         on_change=update_schemas_and_tables,
 97 |         key="selected_databases",
 98 |         # default=st.session_state.get("selected_databases", []),
 99 |     )
100 | 
101 |     st.multiselect(
102 |         label="Schemas",
103 |         options=st.session_state.get("available_schemas", []),
104 |         placeholder="Select the schemas that contain the tables you'd like to include in your semantic model.",
105 |         on_change=update_tables,
106 |         key="selected_schemas",
107 |         format_func=lambda x: format_snowflake_context(x, -1),
108 |     )
109 | 
110 |     st.multiselect(
111 |         label="Tables",
112 |         options=st.session_state.get("available_tables", []),
113 |         placeholder="Select the tables you'd like to include in your semantic model.",
114 |         key="selected_tables",
115 |         format_func=lambda x: format_snowflake_context(x, -1),
116 |     )
117 | 
118 |     st.markdown("<div style='margin: 240px;'></div>", unsafe_allow_html=True)
119 |     experimental_features = st.checkbox(
120 |         "Enable joins (optional)",
121 |         help="Checking this box will enable you to add/edit join paths in your semantic model. If enabling this setting, please ensure that you have the proper parameters set on your Snowflake account. Reach out to your account team for access.",
122 |     )
123 | 
124 |     st.session_state["experimental_features"] = experimental_features
125 | 
126 |     submit = st.button("Submit", use_container_width=True, type="primary")
127 |     if submit:
128 |         try:
129 |             run_generate_model_str_from_snowflake(
130 |                 model_name,
131 |                 sample_values,
132 |                 st.session_state["selected_tables"],
133 |                 allow_joins=experimental_features,
134 |             )
135 |             st.session_state["page"] = GeneratorAppScreen.ITERATION
136 |             st.rerun()
137 |         except ValueError as e:
138 |             st.error(e)
139 | 
140 | 
141 | def show() -> None:
142 |     table_selector_dialog()
143 | 


--------------------------------------------------------------------------------
/journeys/joins.py:
--------------------------------------------------------------------------------
  1 | from typing import Optional
  2 | 
  3 | import streamlit as st
  4 | from streamlit_extras.row import row
  5 | 
  6 | from app_utils.shared_utils import get_snowflake_connection
  7 | from semantic_model_generator.data_processing.cte_utils import (
  8 |     fully_qualified_table_name,
  9 | )
 10 | from semantic_model_generator.protos import semantic_model_pb2
 11 | from semantic_model_generator.snowflake_utils.snowflake_connector import (
 12 |     get_table_primary_keys,
 13 | )
 14 | 
 15 | SUPPORTED_JOIN_TYPES = [
 16 |     join_type
 17 |     for join_type in semantic_model_pb2.JoinType.values()
 18 |     if join_type != semantic_model_pb2.JoinType.join_type_unknown
 19 | ]
 20 | SUPPORTED_RELATIONSHIP_TYPES = [
 21 |     relationship_type
 22 |     for relationship_type in semantic_model_pb2.RelationshipType.values()
 23 |     if relationship_type
 24 |     != semantic_model_pb2.RelationshipType.relationship_type_unknown
 25 | ]
 26 | 
 27 | 
 28 | def relationship_builder(
 29 |     relationship: semantic_model_pb2.Relationship, key: Optional[int] = 0
 30 | ) -> None:
 31 |     """
 32 |     Renders a UI for building/editing a semantic model relationship.
 33 |     Args:
 34 |         relationship: The relationship object to edit.
 35 | 
 36 |     Returns:
 37 | 
 38 |     """
 39 |     with st.expander(
 40 |         relationship.name or f"{relationship.left_table} ↔️ {relationship.right_table}",
 41 |         expanded=True,
 42 |     ):
 43 |         relationship.name = st.text_input(
 44 |             "Name", value=relationship.name, key=f"name_{key}"
 45 |         )
 46 |         # Logic to preselect the tables in the dropdown based on what's in the semantic model.
 47 |         try:
 48 |             default_left_table = [
 49 |                 table.name for table in st.session_state.semantic_model.tables
 50 |             ].index(relationship.left_table)
 51 |             default_right_table = [
 52 |                 table.name for table in st.session_state.semantic_model.tables
 53 |             ].index(relationship.right_table)
 54 |         except ValueError:
 55 |             default_left_table = 0
 56 |             default_right_table = 0
 57 |         relationship.left_table = st.selectbox(
 58 |             "Left Table",
 59 |             options=[table.name for table in st.session_state.semantic_model.tables],
 60 |             index=default_left_table,
 61 |             key=f"left_table_{key}",
 62 |         )
 63 | 
 64 |         relationship.right_table = st.selectbox(
 65 |             "Right Table",
 66 |             options=[table.name for table in st.session_state.semantic_model.tables],
 67 |             index=default_right_table,
 68 |             key=f"right_table_{key}",
 69 |         )
 70 | 
 71 |         relationship.join_type = st.radio(  # type: ignore
 72 |             "Join Type",
 73 |             options=SUPPORTED_JOIN_TYPES,
 74 |             format_func=lambda join_type: semantic_model_pb2.JoinType.Name(join_type),
 75 |             index=SUPPORTED_JOIN_TYPES.index(relationship.join_type),
 76 |             key=f"join_type_{key}",
 77 |         )
 78 | 
 79 |         relationship.relationship_type = st.radio(  # type: ignore
 80 |             "Relationship Type",
 81 |             options=SUPPORTED_RELATIONSHIP_TYPES,
 82 |             format_func=lambda relationship_type: semantic_model_pb2.RelationshipType.Name(
 83 |                 relationship_type
 84 |             ),
 85 |             index=SUPPORTED_RELATIONSHIP_TYPES.index(relationship.relationship_type),
 86 |             key=f"relationship_type_{key}",
 87 |         )
 88 | 
 89 |         st.divider()
 90 |         # Builder section for the relationship's columns.
 91 |         for col_idx, join_cols in enumerate(relationship.relationship_columns):
 92 |             # Grabbing references to the exact Table objects that the relationship is pointing to.
 93 |             # This allows us to pull the columns.
 94 |             left_table_object = next(
 95 |                 (
 96 |                     table
 97 |                     for table in st.session_state.semantic_model.tables
 98 |                     if table.name == relationship.left_table
 99 |                 )
100 |             )
101 |             right_table_object = next(
102 |                 (
103 |                     table
104 |                     for table in st.session_state.semantic_model.tables
105 |                     if table.name == relationship.right_table
106 |                 )
107 |             )
108 | 
109 |             try:
110 |                 left_columns = []
111 |                 left_columns.extend(left_table_object.columns)
112 |                 left_columns.extend(left_table_object.dimensions)
113 |                 left_columns.extend(left_table_object.time_dimensions)
114 |                 left_columns.extend(left_table_object.measures)
115 | 
116 |                 right_columns = []
117 |                 right_columns.extend(right_table_object.columns)
118 |                 right_columns.extend(right_table_object.dimensions)
119 |                 right_columns.extend(right_table_object.time_dimensions)
120 |                 right_columns.extend(right_table_object.measures)
121 | 
122 |                 default_left_col = [col.name for col in left_columns].index(
123 |                     join_cols.left_column
124 |                 )
125 |                 default_right_col = [col.name for col in right_columns].index(
126 |                     join_cols.right_column
127 |                 )
128 |             except ValueError:
129 |                 default_left_col = 0
130 |                 default_right_col = 0
131 | 
132 |             join_cols.left_column = st.selectbox(
133 |                 "Left Column",
134 |                 options=[col.name for col in left_columns],
135 |                 index=default_left_col,
136 |                 key=f"left_col_{key}_{col_idx}",
137 |             )
138 |             join_cols.right_column = st.selectbox(
139 |                 "Right Column",
140 |                 options=[col.name for col in right_columns],
141 |                 index=default_right_col,
142 |                 key=f"right_col_{key}_{col_idx}",
143 |             )
144 | 
145 |             if st.button("Delete join key", key=f"delete_join_key_{key}_{col_idx}"):
146 |                 relationship.relationship_columns.pop(col_idx)
147 |                 st.rerun()
148 | 
149 |             st.divider()
150 | 
151 |         join_editor_row = row(2, vertical_align="center")
152 |         if join_editor_row.button(
153 |             "Add new join key",
154 |             key=f"add_join_keys_{key}",
155 |             use_container_width=True,
156 |             type="primary",
157 |         ):
158 |             relationship.relationship_columns.append(
159 |                 semantic_model_pb2.RelationKey(
160 |                     left_column="",
161 |                     right_column="",
162 |                 )
163 |             )
164 |             st.rerun()
165 | 
166 |         if join_editor_row.button(
167 |             "🗑️ Delete join path",
168 |             key=f"delete_join_path_{key}",
169 |             use_container_width=True,
170 |         ):
171 |             st.session_state.builder_joins.pop(key)
172 |             st.rerun()
173 | 
174 | 
175 | @st.experimental_dialog("Join Builder", width="large")
176 | def joins_dialog() -> None:
177 |     if "builder_joins" not in st.session_state:
178 |         # Making a copy of the original relationships list so we can modify freely without affecting the original.
179 |         st.session_state.builder_joins = st.session_state.semantic_model.relationships[
180 |             :
181 |         ]
182 | 
183 |     for idx, relationship in enumerate(st.session_state.builder_joins):
184 |         relationship_builder(relationship, idx)
185 | 
186 |     # If the user clicks "Add join", add a new join to the relationships list
187 |     if st.button("Add new join path", use_container_width=True):
188 |         st.session_state.builder_joins.append(
189 |             semantic_model_pb2.Relationship(
190 |                 left_table="",
191 |                 right_table="",
192 |                 join_type=semantic_model_pb2.JoinType.inner,
193 |                 relationship_type=semantic_model_pb2.RelationshipType.one_to_one,
194 |                 relationship_columns=[],
195 |             )
196 |         )
197 |         st.rerun()
198 | 
199 |     # If the user clicks "Save", save the relationships list to the session state
200 |     if st.button("Save to semantic model", use_container_width=True, type="primary"):
201 |         # Quickly validate that all of the user's joins have the required fields.
202 |         for relationship in st.session_state.builder_joins:
203 |             if not relationship.left_table or not relationship.right_table:
204 |                 st.error("Please fill out left and right tables for all join paths.")
205 |                 return
206 | 
207 |             if not relationship.name:
208 |                 st.error(
209 |                     f"The join path between {relationship.left_table} and {relationship.right_table} is missing a name."
210 |                 )
211 |                 return
212 | 
213 |             if not relationship.relationship_columns:
214 |                 st.error(
215 |                     f"The join path between {relationship.left_table} and {relationship.right_table} is missing joinable columns."
216 |                 )
217 |                 return
218 | 
219 |             # Populate primary key information for each table in a join relationship.
220 |             left_table_object = next(
221 |                 (
222 |                     table
223 |                     for table in st.session_state.semantic_model.tables
224 |                     if table.name == relationship.left_table
225 |                 )
226 |             )
227 |             right_table_object = next(
228 |                 (
229 |                     table
230 |                     for table in st.session_state.semantic_model.tables
231 |                     if table.name == relationship.right_table
232 |                 )
233 |             )
234 | 
235 |             with st.spinner("Fetching primary keys..."):
236 |                 if not left_table_object.primary_key.columns:
237 |                     primary_keys = get_table_primary_keys(
238 |                         get_snowflake_connection(),
239 |                         table_fqn=fully_qualified_table_name(
240 |                             left_table_object.base_table
241 |                         ),
242 |                     )
243 |                     left_table_object.primary_key.columns.extend(primary_keys or [""])
244 | 
245 |                 if not right_table_object.primary_key.columns:
246 |                     primary_keys = get_table_primary_keys(
247 |                         get_snowflake_connection(),
248 |                         table_fqn=fully_qualified_table_name(
249 |                             right_table_object.base_table
250 |                         ),
251 |                     )
252 |                     right_table_object.primary_key.columns.extend(primary_keys or [""])
253 | 
254 |         del st.session_state.semantic_model.relationships[:]
255 |         st.session_state.semantic_model.relationships.extend(
256 |             st.session_state.builder_joins
257 |         )
258 |         st.session_state.validated = None
259 |         st.session_state["join_dialog_open"] = False
260 |         st.rerun()
261 | 


--------------------------------------------------------------------------------
/journeys/partner.py:
--------------------------------------------------------------------------------
 1 | import streamlit as st
 2 | 
 3 | 
 4 | @st.experimental_dialog("Partner Semantic Support", width="large")
 5 | def partner_semantic_setup() -> None:
 6 |     """
 7 |     Renders the partner semantic setup dialog with instructions.
 8 |     """
 9 |     from partner.partner_utils import configure_partner_semantic
10 | 
11 |     st.write(
12 |         """
13 |         Have an existing semantic layer in a partner tool that's integrated with Snowflake?
14 |         See the below instructions for integrating your partner semantic specs into Cortex Analyst's semantic file.
15 |         """
16 |     )
17 |     configure_partner_semantic()
18 | 
19 | 
20 | def show() -> None:
21 |     """
22 |     Runs partner setup dialog.
23 |     """
24 |     partner_semantic_setup()
25 | 


--------------------------------------------------------------------------------
/mypy.ini:
--------------------------------------------------------------------------------
 1 | [mypy]
 2 | plugins = pydantic.mypy
 3 | 
 4 | ignore_missing_imports = True
 5 | strict = True
 6 | disallow_untyped_defs = True
 7 | warn_unused_ignores = False
 8 | disallow_any_generics = True
 9 | 
10 | exclude = .venv|venv|pyvenv|(_test\.py|test_.*\.py)|_pb2\.py|_pb2\.pyi|admin_app/streamlit_app.py
11 | 
12 | [mypy-semantic_model_generator.protos.semantic_model_pb2]
13 | ignore_errors = True
14 | 
15 | [mypy-requests]
16 | ignore_missing_imports = True
17 | 


--------------------------------------------------------------------------------
/partner/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/partner/__init__.py


--------------------------------------------------------------------------------
/partner/cortex.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Optional
  2 | 
  3 | import pandas as pd
  4 | import streamlit as st
  5 | 
  6 | from semantic_model_generator.data_processing.proto_utils import (
  7 |     proto_to_dict,
  8 |     yaml_to_semantic_model,
  9 | )
 10 | 
 11 | 
 12 | class CortexDimension:
 13 |     """
 14 |     Class for Cortex dimension-type field.
 15 |     """
 16 | 
 17 |     def __init__(self, data: dict[str, Any]):
 18 | 
 19 |         self.data: dict[str, Any] = data
 20 |         self.name: str = data["name"]
 21 |         self.synonyms: Optional[list[str]] = data.get("synonyms", None)
 22 |         self.data_type: str = data.get("data_type", "TEXT")
 23 |         self.expr: str = data["expr"]
 24 |         self.description: Optional[str] = data.get("description", None)
 25 |         self.sample_values: Optional[list[str]] = data.get("sample_values", None)
 26 |         self.unique: bool = data.get("unique", False)
 27 | 
 28 |     def get_name(self) -> str:
 29 |         return self.name
 30 | 
 31 |     def get_data(self) -> dict[str, Any]:
 32 |         return self.data
 33 | 
 34 |     def get_cortex_type(self) -> str:
 35 |         return self.data_type
 36 | 
 37 |     def get_description(self) -> Optional[str]:
 38 |         return self.description
 39 | 
 40 |     def set_description(self, value: str) -> None:
 41 |         self.description = value
 42 | 
 43 |     def get_cortex_section(self) -> str:
 44 |         return "dimensions"
 45 | 
 46 |     def get_key(self) -> str:
 47 |         return self.expr.upper()
 48 | 
 49 |     def get_cortex_details(self) -> dict[str, Any]:
 50 |         """
 51 |         Used in static methods in partner classes to retrieve and modify Cortex-equivalent details
 52 |         """
 53 |         return self.data
 54 | 
 55 |     def get_cortex_comparison_dict(self) -> dict[str, Any]:
 56 |         return {
 57 |             "field_key": self.get_key(),
 58 |             "section": self.get_cortex_section(),
 59 |             "field_details": self.get_cortex_details(),
 60 |         }
 61 | 
 62 | 
 63 | class CortexTimeDimension(CortexDimension):
 64 |     """
 65 |     Class for Cortex time dimension-type field.
 66 |     """
 67 | 
 68 |     def get_cortex_section(self) -> str:
 69 |         return "time_dimensions"
 70 | 
 71 | 
 72 | class CortexMeasure(CortexDimension):
 73 |     """
 74 |     Class for Cortex measure-type field.
 75 |     """
 76 | 
 77 |     def __init__(self, data: dict[str, Any]):
 78 |         super().__init__(data)
 79 |         self.default_aggregation = data.get("default_aggregation", None)
 80 | 
 81 |     def get_cortex_section(self) -> str:
 82 |         return "measures"
 83 | 
 84 | 
 85 | class CortexSemanticTable:
 86 |     """
 87 |     Class for single Cortex logical table in semantic file.
 88 |     """
 89 | 
 90 |     def __init__(self, data: dict[str, Any]):
 91 |         self.data: dict[str, Any] = data
 92 |         self.name: str = data["name"]
 93 |         self.description: Optional[str] = data["description"]
 94 |         self.base_table_db: str = data["base_table"]["database"]
 95 |         self.base_table_schema: str = data["base_table"]["schema"]
 96 |         self.base_table_table: str = data["base_table"]["table"]
 97 |         self.dimensions: Optional[list[dict[str, Any]]] = data["dimensions"]
 98 |         self.time_dimensions: Optional[list[dict[str, Any]]] = data["time_dimensions"]
 99 |         self.measures: Optional[list[dict[str, Any]]] = data["measures"]
100 | 
101 |     def get_data(self) -> dict[str, Any]:
102 |         return self.data
103 | 
104 |     def get_name(self) -> str:
105 |         return self.name
106 | 
107 |     def get_description(self) -> Optional[str]:
108 |         return self.description
109 | 
110 |     def get_cortex_fields(self) -> list[dict[str, Any]]:
111 |         """
112 |         Processes and returns raw field data as vendor-specific field objects.
113 |         """
114 | 
115 |         cortex_fields = []
116 |         if self.dimensions:
117 |             for dimension in self.dimensions:
118 |                 cortex_fields.append(
119 |                     CortexDimension(dimension).get_cortex_comparison_dict()
120 |                 )
121 |         if self.time_dimensions:
122 |             for time_dimension in self.time_dimensions:
123 |                 cortex_fields.append(
124 |                     CortexTimeDimension(time_dimension).get_cortex_comparison_dict()
125 |                 )
126 |         if self.measures:
127 |             for measure in self.measures:
128 |                 cortex_fields.append(
129 |                     CortexMeasure(measure).get_cortex_comparison_dict()
130 |                 )
131 | 
132 |         return cortex_fields
133 | 
134 |     def create_comparison_df(self) -> pd.DataFrame:
135 |         cortex_fields = self.get_cortex_fields()
136 |         return pd.DataFrame(cortex_fields)
137 | 
138 |     @staticmethod
139 |     def create_cortex_table_list() -> None:
140 |         cortex_semantic = proto_to_dict(
141 |             yaml_to_semantic_model(st.session_state["last_saved_yaml"])
142 |         )
143 |         # Need to replace table details in current entire yaml
144 |         st.session_state["current_yaml_as_dict"] = cortex_semantic
145 |         tables = []
146 |         for table in cortex_semantic["tables"]:
147 |             tables.append(CortexSemanticTable(table))
148 |         st.session_state["cortex_comparison_tables"] = tables
149 | 
150 |     @staticmethod
151 |     def retrieve_df_by_name(name: str) -> pd.DataFrame:
152 |         for table in st.session_state["cortex_comparison_tables"]:
153 |             if table.get_name() == name:
154 |                 return table.create_comparison_df()
155 | 


--------------------------------------------------------------------------------
/partner/dbt.py:
--------------------------------------------------------------------------------
  1 | from typing import Any, Optional, Union
  2 | 
  3 | import pandas as pd
  4 | import streamlit as st
  5 | import yaml
  6 | from snowflake.connector import ProgrammingError
  7 | 
  8 | from app_utils.shared_utils import (
  9 |     download_yaml,
 10 |     get_snowflake_connection,
 11 |     get_yamls_from_stage,
 12 |     set_sit_query_tag,
 13 |     stage_selector_container,
 14 | )
 15 | 
 16 | # Partner semantic support instructions
 17 | DBT_IMAGE = "images/dbt-signature_tm_black.png"
 18 | DBT_MODEL_INSTRUCTIONS = """
 19 | ### [SQL Model](https://docs.getdbt.com/docs/build/sql-models)
 20 | 
 21 | Materialize your SQL model(s) as Snowflake table(s) and generate a Cortex Analyst semantic file for them directly.
 22 | > Steps:
 23 | > 1) Update dbt model(s) to be [materialized](https://docs.getdbt.com/docs/build/materializations) in Snowflake.
 24 | > 2) Update dbt model(s) to [persist docs](https://docs.getdbt.com/reference/resource-configs/persist_docs) to capture table/column descriptions.
 25 | > 3) Run dbt model(s) to materialize in Snowflake.
 26 | > 4) Select **🛠 Create a new semantic model** on the homepage and select the materialized Snowflake table(s).
 27 | """
 28 | DBT_SEMANTIC_INSTRUCTIONS = """
 29 | ### [Semantic Model](https://docs.getdbt.com/docs/build/semantic-models)
 30 | 
 31 | We extract metadata from your dbt semantic yaml file(s) and merge it with a generated Cortex Analyst semantic file.
 32 | 
 33 | **Note**: The DBT semantic layer must be sourced from tables/views in Snowflake.
 34 | If using Streamlit in Snowflake, upload dbt semantic (yaml/yml) file(s) to Snowflake stage first.
 35 | 
 36 | > Steps:
 37 | > 1) Select your dbt semantic (yaml/yml) file(s) below from stage or upload directly if not using Streamlit in Snowflake.
 38 | > 2) Select **🛠 Create a new semantic model** to generate a new Cortex Analyst semantic file for Snowflake tables or **✏️ Edit an existing semantic model**.
 39 | > 3) Validate the output in the UI.
 40 | > 4) Once you've validated the semantic file, click **Partner Semantic** to merge DBT and Cortex Analyst semantic files.
 41 | """
 42 | 
 43 | 
 44 | def upload_dbt_semantic() -> None:
 45 |     """
 46 |     Upload semantic file(s) for dbt from local source.
 47 | 
 48 |     Returns: None
 49 |     """
 50 |     uploaded_files = []
 51 |     if st.session_state["sis"]:
 52 |         stage_selector_container()
 53 |         # Based on the currently selected stage, show a dropdown of YAML files for the user to pick from.
 54 |         available_files = []
 55 |         if (
 56 |             "selected_iteration_stage" in st.session_state
 57 |             and st.session_state["selected_iteration_stage"]
 58 |         ):
 59 |             try:
 60 |                 available_files = get_yamls_from_stage(
 61 |                     st.session_state["selected_iteration_stage"],
 62 |                     include_yml=True,
 63 |                 )
 64 |             except (ValueError, ProgrammingError):
 65 |                 st.error("Insufficient permissions to read from the selected stage.")
 66 |                 st.stop()
 67 | 
 68 |         stage_files = st.multiselect("Staged files", options=available_files)
 69 |         if stage_files:
 70 |             for staged_file in stage_files:
 71 |                 file_content = download_yaml(
 72 |                     staged_file, st.session_state["selected_iteration_stage"]
 73 |                 )
 74 |                 uploaded_files.append(file_content)
 75 |     else:
 76 |         uploaded_files = st.file_uploader(  # type: ignore
 77 |             f'Upload {st.session_state["partner_tool"]} semantic yaml file(s)',
 78 |             type=["yaml", "yml"],
 79 |             accept_multiple_files=True,
 80 |             key="dbt_files",
 81 |         )
 82 |     if uploaded_files:
 83 |         partner_semantic: list[Union[None, DBTSemanticModel]] = []
 84 |         for file in uploaded_files:
 85 |             partner_semantic.extend(read_dbt_yaml(file))  # type: ignore
 86 | 
 87 |         if not partner_semantic:
 88 |             st.error("Upload file(s) do not contain required semantic_models section.")
 89 |         else:
 90 |             st.session_state["partner_semantic"] = partner_semantic
 91 |         if st.button("Continue", type="primary"):
 92 |             st.session_state["partner_setup"] = True
 93 |             set_sit_query_tag(
 94 |                 get_snowflake_connection(),
 95 |                 vendor="dbt",
 96 |                 action="setup_complete",
 97 |             )
 98 |             st.rerun()
 99 |     else:
100 |         st.session_state["partner_semantic"] = None
101 | 
102 | 
103 | class DBTEntity:
104 |     """
105 |     Class for dbt entity-type field.
106 |     """
107 | 
108 |     def __init__(self, entity: dict[str, Any]):
109 | 
110 |         self.entity: dict[str, Any] = entity
111 |         self.name: str = entity["name"]
112 |         self.type: str = entity.get("type", None)
113 |         self.expr: str = entity.get("expr", self.name)
114 |         self.description: Optional[str] = entity.get("description", None)
115 |         self.cortex_map = {
116 |             "name": self.name,
117 |             "description": self.description,
118 |             "expr": self.expr,
119 |             "data_type": self.get_cortex_type(),
120 |         }
121 | 
122 |     def get_data(self) -> dict[str, Any]:
123 |         return self.entity
124 | 
125 |     def get_cortex_type(self) -> str:
126 |         return "TEXT"
127 | 
128 |     def get_cortex_section(self) -> str:
129 |         return "dimensions"
130 | 
131 |     def get_key(self) -> str:
132 |         return self.expr.upper()
133 | 
134 |     def get_cortex_details(self) -> dict[str, Any]:
135 |         return_details = {}
136 |         for k, v in self.cortex_map.items():
137 |             if v is not None:
138 |                 return_details[k] = v
139 |         return return_details
140 | 
141 |     def get_cortex_comparison_dict(self) -> dict[str, Any]:
142 |         return {
143 |             "field_key": self.get_key(),
144 |             "section": self.get_cortex_section(),
145 |             "field_details": self.get_cortex_details(),
146 |         }
147 | 
148 | 
149 | class DBTMeasure(DBTEntity):
150 |     """
151 |     Class for dbt measure-type field.
152 |     """
153 | 
154 |     def __init__(self, entity: dict[str, Any]):
155 |         super().__init__(entity)
156 |         self.agg: Optional[str] = entity.get("agg", None)
157 |         self.cortex_map = {
158 |             "name": self.name,
159 |             "description": self.description,
160 |             "expr": self.expr,
161 |             "data_type": self.get_cortex_type(),
162 |             "default_aggregation": self.agg,
163 |         }
164 | 
165 |     def get_cortex_type(self) -> str:
166 |         return "NUMBER"
167 | 
168 |     def get_cortex_section(self) -> str:
169 |         return "measures"
170 | 
171 | 
172 | class DBTDimension(DBTEntity):
173 |     """
174 |     Class for dbt dimension-type field.
175 |     """
176 | 
177 |     def get_cortex_type(self) -> str:
178 |         if self.type == "time":
179 |             return "DATETIME"
180 |         else:
181 |             return "TEXT"
182 | 
183 |     def get_cortex_section(self) -> str:
184 |         if self.type == "time":
185 |             return "time_dimensions"
186 |         else:
187 |             return "dimensions"
188 | 
189 | 
190 | class DBTSemanticModel:
191 |     """
192 |     Class for single DBT semantic model.
193 |     """
194 | 
195 |     def __init__(self, data: dict[str, Any]):
196 |         self.data: dict[str, Any] = data
197 |         self.name: str = data["name"]
198 |         self.description: Optional[str] = data.get("description", None)
199 |         self.entities: Optional[list[dict[str, Any]]] = data["entities"]
200 |         self.dimensions: Optional[list[dict[str, Any]]] = data["dimensions"]
201 |         self.measures: Optional[list[dict[str, Any]]] = data["measures"]
202 | 
203 |     def get_data(self) -> dict[str, Any]:
204 |         return self.data
205 | 
206 |     def get_name(self) -> str:
207 |         return self.name
208 | 
209 |     def get_description(self) -> Optional[str]:
210 |         return self.description
211 | 
212 |     def get_cortex_fields(self) -> list[dict[str, Any]]:
213 |         cortex_fields = []
214 |         if self.entities:
215 |             for entity in self.entities:
216 |                 cortex_fields.append(DBTEntity(entity).get_cortex_comparison_dict())
217 |         if self.measures:
218 |             for measure in self.measures:
219 |                 cortex_fields.append(DBTMeasure(measure).get_cortex_comparison_dict())
220 |         if self.dimensions:
221 |             for dimension in self.dimensions:
222 |                 cortex_fields.append(
223 |                     DBTDimension(dimension).get_cortex_comparison_dict()
224 |                 )
225 | 
226 |         return cortex_fields
227 | 
228 |     def create_comparison_df(self) -> pd.DataFrame:
229 |         cortex_fields = self.get_cortex_fields()
230 |         return pd.DataFrame(cortex_fields)
231 | 
232 |     @staticmethod
233 |     def retrieve_df_by_name(name: str) -> pd.DataFrame:
234 |         for model in st.session_state["partner_semantic"]:
235 |             if model.get_name() == name:
236 |                 return model.create_comparison_df()
237 | 
238 | 
239 | def read_dbt_yaml(file_path: str) -> list[DBTSemanticModel]:
240 |     """
241 |     Reads file uploads and extracts dbt semantic files in list.
242 |     Args:
243 |         file_path (str): Local file path uploaded by user.
244 | 
245 |     Returns: None | list[DBTSemanticModel]
246 |     """
247 | 
248 |     data = yaml.safe_load(file_path)
249 |     dbt_semantic_models = []
250 |     if "semantic_models" in data:
251 |         # dbt_semantic_models = []
252 |         for semantic_model in data["semantic_models"]:
253 |             dbt_semantic_models.append(DBTSemanticModel(semantic_model))
254 |     else:
255 |         st.warning(f"{file_path} does not contain semantic_models section. Skipping.")
256 |     return dbt_semantic_models
257 | 


--------------------------------------------------------------------------------
/partner/partner_utils.py:
--------------------------------------------------------------------------------
  1 | import json
  2 | import time
  3 | from enum import Enum
  4 | from typing import Any, Union
  5 | 
  6 | import numpy as np
  7 | import pandas as pd
  8 | import streamlit as st
  9 | import yaml
 10 | 
 11 | from app_utils.shared_utils import (
 12 |     get_snowflake_connection,
 13 |     render_image,
 14 |     set_sit_query_tag,
 15 | )
 16 | from partner.cortex import CortexSemanticTable
 17 | from partner.dbt import DBTSemanticModel, upload_dbt_semantic
 18 | from semantic_model_generator.data_processing.proto_utils import yaml_to_semantic_model
 19 | 
 20 | 
 21 | class PartnerTool(Enum):
 22 |     DBT_SQL_MODEL = "dbt - SQL Model"
 23 |     DBT_SEMANTIC_MODEL = "dbt - Semantic Model"
 24 |     LOOKER_EXPLORE = "Looker - Explore"
 25 | 
 26 | 
 27 | def set_partner_instructions() -> None:
 28 |     """
 29 |     Sets instructions and partner logo in session_state based on selected partner.
 30 |     Returns: None
 31 |     """
 32 | 
 33 |     if st.session_state.get("partner_tool", None):
 34 |         if st.session_state["partner_tool"] == PartnerTool.DBT_SQL_MODEL.value:
 35 |             from partner.dbt import DBT_IMAGE, DBT_MODEL_INSTRUCTIONS
 36 | 
 37 |             instructions = DBT_MODEL_INSTRUCTIONS
 38 |             image = DBT_IMAGE
 39 |             image_size = (72, 32)
 40 |         elif st.session_state["partner_tool"] == PartnerTool.DBT_SEMANTIC_MODEL.value:
 41 |             from partner.dbt import DBT_IMAGE, DBT_SEMANTIC_INSTRUCTIONS
 42 | 
 43 |             instructions = DBT_SEMANTIC_INSTRUCTIONS
 44 |             image = DBT_IMAGE
 45 |             image_size = (72, 32)
 46 |         elif st.session_state["partner_tool"] == PartnerTool.LOOKER_EXPLORE.value:
 47 |             from partner.looker import LOOKER_IMAGE, LOOKER_INSTRUCTIONS
 48 | 
 49 |             instructions = LOOKER_INSTRUCTIONS
 50 |             image = LOOKER_IMAGE
 51 |             image_size = (72, 72)
 52 |         st.session_state["partner_instructions"] = instructions
 53 |         st.session_state["partner_image"] = image
 54 |         st.session_state["partner_image_size"] = image_size
 55 | 
 56 | 
 57 | def configure_partner_semantic() -> None:
 58 |     """
 59 |     Upload semantic files from local source.
 60 |     Returns: None
 61 |     """
 62 | 
 63 |     partners = [tool.value for tool in PartnerTool]
 64 | 
 65 |     st.selectbox(
 66 |         "Select the partner tool",
 67 |         partners,
 68 |         index=None,
 69 |         key="partner_tool",
 70 |         on_change=set_partner_instructions(),  # type: ignore
 71 |     )
 72 |     if st.session_state.get("partner_tool", None):
 73 |         with st.expander(
 74 |             "Instructions",
 75 |             expanded=True,
 76 |         ):
 77 |             render_image(
 78 |                 st.session_state["partner_image"],
 79 |                 st.session_state["partner_image_size"],
 80 |             )
 81 |             st.write(st.session_state["partner_instructions"])
 82 | 
 83 |     # Previous dialog box widget values will reset when overlayed
 84 |     if st.session_state.get("partner_tool", None):
 85 |         st.session_state["selected_partner"] = st.session_state["partner_tool"]
 86 | 
 87 |     if st.session_state["partner_tool"] == PartnerTool.DBT_SEMANTIC_MODEL.value:
 88 |         upload_dbt_semantic()
 89 |     if st.session_state["partner_tool"] == PartnerTool.LOOKER_EXPLORE.value:
 90 |         from partner.looker import set_looker_semantic
 91 | 
 92 |         set_looker_semantic()
 93 |     if st.session_state["partner_tool"] == PartnerTool.DBT_SQL_MODEL.value:
 94 |         st.session_state["partner_setup"] = False
 95 | 
 96 | 
 97 | class PartnerCompareRow:
 98 |     """
 99 |     Renders matched and unmatched cortex and partner fields for comparison.
100 |     """
101 | 
102 |     def __init__(self, row_data: pd.Series) -> None:  # type: ignore
103 |         self.row_data = row_data
104 |         self.key = row_data["field_key"]
105 |         self.cortex_metadata = (
106 |             self.row_data["field_details_cortex"]
107 |             if self.row_data["field_details_cortex"]
108 |             else {}
109 |         )
110 |         self.partner_metadata = (
111 |             self.row_data["field_details_partner"]
112 |             if self.row_data["field_details_partner"]
113 |             else {}
114 |         )
115 | 
116 |     def render_row(self) -> Union[None, dict[str, Any]]:  # type: ignore
117 |         toggle_options = ["merged", "cortex", "partner", "remove"]
118 |         metadata = {}
119 | 
120 |         # Create metadata based for each field given merging or singular semantic file useage of the field
121 |         # Merge will merge the 2 based on user-selected preference
122 |         if self.cortex_metadata and self.partner_metadata:
123 |             metadata["merged"] = self.cortex_metadata.copy()
124 |             if st.session_state["partner_metadata_preference"] == "Partner":
125 |                 metadata["merged"] = {
126 |                     k: v for k, v in self.cortex_metadata.items() if v
127 |                 } | {k: v for k, v in self.partner_metadata.items() if v}
128 |             else:
129 |                 metadata["merged"] = {
130 |                     k: v for k, v in self.partner_metadata.items() if v
131 |                 } | {k: v for k, v in self.cortex_metadata.items() if v}
132 | 
133 |         else:
134 |             metadata["merged"] = {}
135 |         metadata["partner"] = self.partner_metadata if self.partner_metadata else {}
136 |         metadata["cortex"] = self.cortex_metadata if self.cortex_metadata else {}
137 |         metadata["remove"] = {}
138 | 
139 |         if metadata["merged"]:
140 |             toggle_default = "merged"
141 |         elif metadata["partner"]:
142 |             if st.session_state["keep_extra_partner"]:
143 |                 toggle_default = "partner"
144 |             else:
145 |                 toggle_default = "remove"
146 |         elif metadata["cortex"]:
147 |             if st.session_state["keep_extra_cortex"]:
148 |                 toggle_default = "cortex"
149 |             else:
150 |                 toggle_default = "remove"
151 |         else:
152 |             toggle_default = "remove"
153 | 
154 |         key_col, detail_col = st.columns((0.5, 1))
155 |         with key_col:
156 |             st.write(self.key)
157 |             # We want to disable non-options but always keep remove option
158 |             revised_options = [
159 |                 i for i in toggle_options if metadata[i] or i == "remove"
160 |             ]
161 |             detail_selection: str = st.radio(
162 |                 "Keep",  # type: ignore
163 |                 index=revised_options.index(toggle_default),
164 |                 options=revised_options,
165 |                 key=f"row_{self.key}",
166 |                 format_func=lambda x: x.capitalize(),
167 |                 label_visibility="collapsed",
168 |             )
169 |         with detail_col:
170 |             if metadata[detail_selection]:
171 |                 # Only printing string valued keys for now
172 |                 st.json(
173 |                     {
174 |                         k: v
175 |                         for k, v in metadata[detail_selection].items()
176 |                         if isinstance(v, str)
177 |                     }
178 |                 )
179 |             else:
180 |                 st.write("NA")
181 |         st.divider()
182 |         # Extract the selected metadata if not set to remove
183 |         if detail_selection != "remove":
184 |             selected_metadata: dict[str, Any] = metadata[detail_selection]
185 |             # Add expr to selected metadata if it's not included which is the case for dbt
186 |             selected_metadata["expr"] = self.key
187 |             return selected_metadata
188 | 
189 | 
190 | def compare_sections(section_cortex: str, section_partner: str) -> str:
191 |     """
192 |     Compares section_cortex and section_parnter returning the former if available.
193 |     Otherwise, returns the latter.
194 | 
195 |     Args:
196 |         section_cortex (str): The Cortex section of the Cortex field if found.
197 |         section_cortex (str): The Cortex section of the Partner field if found.
198 | 
199 |     Returns:
200 |         str: Cortex section name.
201 |     """
202 | 
203 |     if section_cortex:
204 |         return section_cortex
205 |     else:
206 |         return section_partner
207 | 
208 | 
209 | def compare_data_types(
210 |     details_cortex: dict[str, Any], details_partner: dict[str, Any]
211 | ) -> Any:
212 |     """
213 |     Returns intended cortex datatype comparing cortex and partner datatype values.
214 | 
215 |     Args:
216 |         details_cortex (dict[str, Any]): Dictionary of Cortex field metadata.
217 |         details_partner (dict[str, Any]): Dictionary of Parnter's Cortex field metadata.
218 | 
219 |     Returns:
220 |         str: Cortex data_type.
221 |     """
222 | 
223 |     cortex_data_type = None
224 |     partner_data_type = None
225 | 
226 |     if isinstance(details_cortex, dict):
227 |         cortex_data_type = details_cortex.get("data_type", None)
228 |     if isinstance(details_partner, dict):
229 |         partner_data_type = details_partner.get("data_type", None)
230 | 
231 |     if cortex_data_type:
232 |         return cortex_data_type
233 |     elif partner_data_type:
234 |         return partner_data_type
235 |     else:
236 |         return "TEXT"
237 | 
238 | 
239 | @st.experimental_dialog("Integrate partner tool semantic specs", width="large")
240 | def integrate_partner_semantics() -> None:
241 |     """
242 |     Runs UI module for comparing Cortex and Partner fields for integration.
243 | 
244 |     Returns:
245 |         None
246 |     """
247 | 
248 |     st.write(
249 |         "Specify how to merge semantic metadata from your selected partner tool with Cortex Analyst's semantic model."
250 |     )
251 | 
252 |     st.write(f"Partner: **{st.session_state.get('selected_partner', None)}**")
253 | 
254 |     COMPARE_SEMANTICS_HELP = """Which semantic file should be checked first for necessary metadata.
255 |     Where metadata is missing, the other semantic file will be checked."""
256 | 
257 |     INTEGRATE_HELP = (
258 |         """Merge the selected Snowflake and Partner tables' semantics together."""
259 |     )
260 | 
261 |     SAVE_HELP = """Save the merges to the Cortex Analyst semantic model for validation and iteration."""
262 | 
263 |     KEEP_CORTEX_HELP = """Retain fields that are found in Cortex Analyst semantic model
264 |     but not in Partner semantic model."""
265 | 
266 |     KEEP_PARTNER_HELP = """Retain fields that are found in Partner semantic model
267 |     but not in Cortex Analyst semantic model."""
268 | 
269 |     if st.session_state.get("partner_setup", False):
270 |         # Execute pre-processing behind the scenes based on vendor tool
271 |         CortexSemanticTable.create_cortex_table_list()
272 | 
273 |         if (
274 |             st.session_state.get("selected_partner", None)
275 |             == PartnerTool.LOOKER_EXPLORE.value
276 |         ):
277 |             from partner.looker import LookerSemanticTable
278 | 
279 |             LookerSemanticTable.create_cortex_table_list()
280 |         elif (
281 |             st.session_state.get("selected_partner", None)
282 |             == PartnerTool.DBT_SEMANTIC_MODEL.value
283 |         ):
284 |             pass
285 |         else:
286 |             st.error("Selected partner tool not available.")
287 | 
288 |         # Create table selections for comparison
289 |         partner_tables = [
290 |             model.get_name() for model in st.session_state["partner_semantic"]
291 |         ]
292 |         cortex_tables = [
293 |             table.get_name() for table in st.session_state["cortex_comparison_tables"]
294 |         ]
295 | 
296 |         st.write("Select which logical tables/views to compare and merge.")
297 |         c1, c2 = st.columns(2)
298 |         with c1:
299 |             semantic_cortex_tbl: str = st.selectbox("Snowflake", cortex_tables)  # type: ignore
300 |         with c2:
301 |             semantic_partner_tbl: str = st.selectbox("Partner", partner_tables)  # type: ignore
302 | 
303 |         st.session_state["partner_metadata_preference"] = st.selectbox(
304 |             "For fields shared in both sources, which source should be checked first for common metadata?",
305 |             ["Partner", "Cortex"],
306 |             index=0,
307 |             help=COMPARE_SEMANTICS_HELP,
308 |         )
309 |         orphan_label, orphan_col1, orphan_col2 = st.columns(3, gap="small")
310 |         with orphan_label:
311 |             st.write("Retain unmatched fields:")
312 |         with orphan_col1:
313 |             st.session_state["keep_extra_cortex"] = st.toggle(
314 |                 "Cortex", value=True, help=KEEP_CORTEX_HELP
315 |             )
316 |         with orphan_col2:
317 |             st.session_state["keep_extra_partner"] = st.toggle(
318 |                 "Partner", value=True, help=KEEP_PARTNER_HELP
319 |             )
320 |         with st.expander("Advanced configuration", expanded=False):
321 |             # Create dataframe of each semantic file's fields with mergeable keys
322 |             st.caption("Only shared metadata information displayed")
323 |             cortex_fields_df = CortexSemanticTable.retrieve_df_by_name(
324 |                 semantic_cortex_tbl
325 |             )
326 | 
327 |             if (
328 |                 st.session_state.get("selected_partner", None)
329 |                 == PartnerTool.LOOKER_EXPLORE.value
330 |             ):
331 |                 from partner.looker import LookerSemanticTable
332 | 
333 |                 partner_fields_df = LookerSemanticTable.retrieve_df_by_name(
334 |                     semantic_partner_tbl
335 |                 )
336 |             if (
337 |                 st.session_state.get("selected_partner", None)
338 |                 == PartnerTool.DBT_SEMANTIC_MODEL.value
339 |             ):
340 |                 partner_fields_df = DBTSemanticModel.retrieve_df_by_name(
341 |                     semantic_partner_tbl
342 |                 )
343 | 
344 |             combined_fields_df = cortex_fields_df.merge(
345 |                 partner_fields_df,
346 |                 on="field_key",
347 |                 how="outer",
348 |                 suffixes=("_cortex", "_partner"),
349 |             ).replace(
350 |                 np.nan, None
351 |             )  # Will be comparing values to None in UI logic
352 | 
353 |             # Convert json strings to dict for easier extraction later
354 |             for col in ["field_details_cortex", "field_details_partner"]:
355 |                 combined_fields_df[col] = combined_fields_df[col].apply(
356 |                     lambda x: (
357 |                         json.loads(x)
358 |                         if not pd.isnull(x) and not isinstance(x, dict)
359 |                         else x
360 |                     )
361 |                 )
362 | 
363 |             # Create containers and store them in a dictionary
364 |             containers = {
365 |                 "dimensions": st.container(),
366 |                 "measures": st.container(),
367 |                 "time_dimensions": st.container(),
368 |             }
369 | 
370 |             # Assign labels to the containers
371 |             for key in containers.keys():
372 |                 containers[key].write(f"**{key.replace('_', ' ').title()}**")
373 | 
374 |             # Initialize sections as empty lists
375 |             sections: dict[str, list[dict[str, Any]]] = {
376 |                 key: [] for key in containers.keys()
377 |             }
378 |             for k, v in combined_fields_df.iterrows():
379 |                 # Get destination section and intended data type for cortex analyst semantic file
380 |                 # If the key is found from the generator, use it. Otherwise, use the partner-specific logic.
381 |                 target_section = compare_sections(
382 |                     v["section_cortex"], v["section_partner"]
383 |                 )
384 |                 target_data_type = compare_data_types(
385 |                     v["field_details_cortex"], v["field_details_partner"]
386 |                 )
387 |                 with containers[target_section]:
388 |                     selected_metadata = PartnerCompareRow(v).render_row()
389 |                     if selected_metadata:
390 |                         selected_metadata["data_type"] = target_data_type
391 |                         sections[target_section].append(selected_metadata)
392 | 
393 |         integrate_col, commit_col, _ = st.columns((1, 1, 5), gap="small")
394 |         with integrate_col:
395 |             merge_button = st.button(
396 |                 "Merge", help=INTEGRATE_HELP, use_container_width=True
397 |             )
398 |         with commit_col:
399 |             reset_button = st.button(
400 |                 "Save",
401 |                 help=SAVE_HELP,
402 |                 use_container_width=True,
403 |             )
404 | 
405 |         if merge_button:
406 |             set_sit_query_tag(
407 |                 get_snowflake_connection(),
408 |                 vendor=st.session_state["selected_partner"],
409 |                 action="merge",
410 |             )
411 |             # Update fields in cortex semantic model
412 |             for i, tbl in enumerate(st.session_state["cortex_comparison_tables"]):
413 |                 if tbl.get_name() == semantic_cortex_tbl:
414 |                     for k in sections.keys():
415 |                         st.session_state["current_yaml_as_dict"]["tables"][i][k] = (
416 |                             sections[k]
417 |                         )
418 | 
419 |             try:
420 |                 st.session_state["yaml"] = yaml.dump(
421 |                     st.session_state["current_yaml_as_dict"], sort_keys=False
422 |                 )
423 |                 st.session_state["semantic_model"] = yaml_to_semantic_model(
424 |                     st.session_state["yaml"]
425 |                 )
426 |                 merge_msg = st.success("Merging...")
427 |                 time.sleep(1)
428 |                 merge_msg.empty()
429 |             except Exception as e:
430 |                 st.error(f"Integration failed: {e}")
431 | 
432 |         if reset_button:
433 |             set_sit_query_tag(
434 |                 get_snowflake_connection(),
435 |                 vendor=st.session_state["selected_partner"],
436 |                 action="integration_complete",
437 |             )
438 |             st.success(
439 |                 "Integration complete! Please validate your semantic model before uploading."
440 |             )
441 |             time.sleep(1.5)
442 |             st.rerun()  # Lazy alternative to resetting all configurations
443 |     else:
444 |         st.error("Partner semantic not setup.")
445 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [tool.poetry]
 2 | name = "semantic-model-generator"
 3 | version = "1.0.0"
 4 | description = "Curate a Semantic Model for Snowflake Cortex Analyst"
 5 | authors = ["Jonathan Hilgart <jonathan.hilgart@snowflake.com>", "Nipun Sehrawat <nipun.sehrawat@snowflake.com>", "Renee Huang <renee.huang@snowflake.com>", "Nicole Limtiaco <nicole.limtiaco@snowflake.com>"]
 6 | license = "Apache Software License; BSD License"
 7 | readme = "README.md"
 8 | 
 9 | [tool.poetry.dependencies]
10 | python = ">=3.9,<3.9.7 || >3.9.7,<3.12"
11 | pandas = "^2.0.1"
12 | loguru = "^0.7.2"
13 | snowflake-connector-python = { extras = ["secure-local-storage", "pandas"], version = "^3.11.0" }
14 | protobuf = "5.26.1"
15 | pydantic = "2.8.2"
16 | PyYAML = "^6.0.1"
17 | "ruamel.yaml" = "0.17.21"
18 | tqdm = "^4.66.5"
19 | pyarrow = "14.0.2"
20 | sqlglot = "25.10.0"
21 | strictyaml = "^1.7.3"
22 | streamlit = "1.36.0"
23 | streamlit-extras = "0.4.0"
24 | numpy = "^1.26.4"
25 | python-dotenv = "^1.0.1"
26 | urllib3 = "^1.26.19"
27 | requests = "^2.32.3"
28 | snowflake-snowpark-python = "1.18.0"
29 | 
30 | # Optional dependencies for functionality such as partner semantic model support.
31 | looker-sdk = { version = "^24.14.0", optional = true }
32 | 
33 | [tool.poetry.group.dev.dependencies]
34 | mypy = "^1.9.0"
35 | black = "^24.3.0"
36 | isort = "^5.13.2"
37 | flake8 = "^7.0.0"
38 | pytest = "^8.1.1"
39 | types-pyyaml = "^6.0.12.20240311"
40 | types-protobuf = "^4.24.0.20240311"
41 | pip-licenses = "^4.4.0"
42 | grpcio-tools = "1.64.1"
43 | 
44 | [tool.poetry.extras]
45 | looker = ["looker-sdk"]
46 | 
47 | [build-system]
48 | requires = ["poetry-core"]
49 | build-backend = "poetry.core.masonry.api"
50 | 


--------------------------------------------------------------------------------
/semantic_model_generator/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/semantic_model_generator/__init__.py


--------------------------------------------------------------------------------
/semantic_model_generator/data_processing/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/semantic_model_generator/data_processing/__init__.py


--------------------------------------------------------------------------------
/semantic_model_generator/data_processing/cte_utils.py:
--------------------------------------------------------------------------------
  1 | # TODO: Add tests for quoted columns, which are not well tested today.
  2 | 
  3 | import copy
  4 | from typing import List, Optional
  5 | 
  6 | import sqlglot
  7 | import sqlglot.expressions
  8 | from loguru import logger
  9 | from sqlglot.dialects.snowflake import Snowflake
 10 | 
 11 | from semantic_model_generator.protos import semantic_model_pb2
 12 | from semantic_model_generator.snowflake_utils.snowflake_connector import (
 13 |     OBJECT_DATATYPES,
 14 | )
 15 | 
 16 | _LOGICAL_TABLE_PREFIX = "__"
 17 | 
 18 | 
 19 | def is_logical_table(table_name: str) -> bool:
 20 |     """Returns true if 'table_name' is a logical table name."""
 21 |     return table_name.startswith(_LOGICAL_TABLE_PREFIX) and len(table_name) > len(
 22 |         _LOGICAL_TABLE_PREFIX
 23 |     )
 24 | 
 25 | 
 26 | def logical_table_name(table: semantic_model_pb2.Table) -> str:
 27 |     """Returns the name of logical table for a given table.  E.g. __fact"""
 28 |     return _LOGICAL_TABLE_PREFIX + table.name  # type: ignore[no-any-return]
 29 | 
 30 | 
 31 | def fully_qualified_table_name(table: semantic_model_pb2.FullyQualifiedTable) -> str:
 32 |     """Returns fully qualified table name such as my_db.my_schema.my_table"""
 33 |     fqn = table.table
 34 |     if len(table.schema) > 0:
 35 |         fqn = f"{table.schema}.{fqn}"
 36 |     if len(table.database) > 0:
 37 |         fqn = f"{table.database}.{fqn}"
 38 |     return fqn  # type: ignore[no-any-return]
 39 | 
 40 | 
 41 | def is_aggregation_expr(col: semantic_model_pb2.Column) -> bool:
 42 |     """Check if an expr contains aggregation function.
 43 |     Note: only flag True for aggregations that would changes number of rows of data.
 44 |     For window function, given the operation will produce value per row, mark as False here.
 45 | 
 46 |     Raises:
 47 |         ValueError: if expr is not parsable, or if aggregation expressions in non-measure columns.
 48 |     """
 49 |     parsed = sqlglot.parse_one(col.expr, dialect=Snowflake)
 50 |     agg_func = list(parsed.find_all(sqlglot.expressions.AggFunc))
 51 |     window = list(parsed.find_all(sqlglot.expressions.Window))
 52 |     # We've confirmed window functions cannot appear inside aggregate functions
 53 |     # (gets execution error msg: Window function [SUM(...) OVER (PARTITION BY ...)] may not appear inside an aggregate function).
 54 |     # So if there's a window function present there can't also be an aggregate function applied to the window function.
 55 |     if len(agg_func) > 0 and len(window) == 0:
 56 |         if col.kind != 2:
 57 |             raise ValueError("Only allow aggregation expressions for measures.")
 58 |         return True
 59 |     return False
 60 | 
 61 | 
 62 | def _is_physical_table_column(col: semantic_model_pb2.Column) -> bool:
 63 |     """Returns whether the column refers to a single raw table column."""
 64 |     try:
 65 |         parsed = sqlglot.parse_one(col.expr, dialect=Snowflake)
 66 |         return isinstance(parsed, sqlglot.expressions.Column)
 67 |     except Exception as ex:
 68 |         logger.warning(
 69 |             f"Failed to parse sql expression: {col.expr}. Error: {ex}. {col}"
 70 |         )
 71 |         return False
 72 | 
 73 | 
 74 | def _is_identifier_quoted(col_name: str) -> bool:
 75 |     return '"' in col_name
 76 | 
 77 | 
 78 | def remove_ltable_cte(sql_w_ltable_cte: str, table_names: list[str]) -> str:
 79 |     """
 80 |     Given a SQL with prefix'd logical table conversion CTE(s), remove the logical table conversions.
 81 |     Args:
 82 |         sql_w_ltable_cte: the sql with logical table conversion CTE(s).
 83 |         table_names: list of tables in the semantic model.
 84 | 
 85 |     Returns: the sql without the logical table conversion CTE.
 86 |     Raises: ValueError if didn't find any CTE or parsed first CTE is not logical table CTE.
 87 |     """
 88 |     ast = sqlglot.parse_one(sql_w_ltable_cte, read=Snowflake)
 89 |     with_ = ast.args.get("with")
 90 |     if with_ is None:
 91 |         raise ValueError("Analyst queries must contain the logical CTE.")
 92 |     if not is_logical_table(with_.expressions[0].alias):
 93 |         raise ValueError("Analyst queries must contain the logical CTE.")
 94 | 
 95 |     table_names_lower = [table_name.lower() for table_name in table_names]
 96 |     # Iterate through all CTEs, and filter out logical table CTEs.
 97 |     # This is done by checking if the CTE alias starts with the logical table prefix and if the alias is in a table in the semantic model.
 98 |     non_logical_cte = [
 99 |         cte
100 |         for cte in with_.expressions
101 |         if not is_logical_table(cte.alias)
102 |         or cte.alias.replace(_LOGICAL_TABLE_PREFIX, "").lower() not in table_names_lower
103 |     ]
104 | 
105 |     # Replace the original expressions list with the filtered list
106 |     with_.set("expressions", non_logical_cte)
107 | 
108 |     # If no expressions are left for whatever reason, remove the entire WITH clause.
109 |     if not with_.expressions:
110 |         ast.set("with", None)
111 | 
112 |     sql_without_logical_cte = ast.sql(dialect=Snowflake, pretty=True)
113 |     return sql_without_logical_cte  # type: ignore [no-any-return]
114 | 
115 | 
116 | def _validate_col(column: semantic_model_pb2.Column) -> None:
117 |     if " " in column.name.strip():
118 |         raise ValueError(
119 |             f"Please do not include spaces in your column name: {column.name}"
120 |         )
121 |     if column.data_type.upper() in OBJECT_DATATYPES:
122 |         raise ValueError(
123 |             f"We do not support object datatypes in the semantic model. Col {column.name} has data type {column.data_type}. Please remove this column from your semantic model or flatten it to non-object type."
124 |         )
125 | 
126 | 
127 | def validate_all_cols(table: semantic_model_pb2.Table) -> None:
128 |     for column in table.columns:
129 |         _validate_col(column)
130 | 
131 | 
132 | def _get_col_expr(column: semantic_model_pb2.Column) -> str:
133 |     """Return column expr in SQL format.
134 |     Raise errors if columns is of OBJECT_DATATYPES, which we do not support today."""
135 |     return (
136 |         f"{column.expr.strip()} as {column.name.strip()}"
137 |         if column.expr.strip().lower() != column.name.strip().lower()
138 |         else f"{column.expr.strip()}"
139 |     )
140 | 
141 | 
142 | def _generate_cte_for(
143 |     table: semantic_model_pb2.Table, columns: List[semantic_model_pb2.Column]
144 | ) -> str:
145 |     """
146 |     Returns a CTE representing a logical table that selects 'col' columns from 'table'.
147 |     """
148 | 
149 |     if len(columns) == 0:
150 |         raise ValueError("Please include at least one column to generate CTE on.")
151 |     else:
152 |         expr_columns = [_get_col_expr(col) for col in columns]
153 |         cte = f"WITH {logical_table_name(table)} AS (\n"
154 |         cte += "SELECT \n"
155 |         cte += ",\n".join(expr_columns) + "\n"
156 |         cte += f"FROM {fully_qualified_table_name(table.base_table)}"
157 |         cte += ")"
158 |         return cte
159 | 
160 | 
161 | def get_all_physical_column_references(
162 |     column: semantic_model_pb2.Column,
163 | ) -> List[str]:
164 |     """Returns a set of column names referenced in the column expression.
165 | 
166 |     For example, the following column expressions yield the following return values:
167 |     foo -> [foo]
168 |     foo+bar -> [foo, bar]
169 |     sum(foo) -> [foo]
170 |     """
171 |     try:
172 |         parsed = sqlglot.parse_one(column.expr, dialect=Snowflake)
173 |         col_names = set()
174 |         for col in parsed.find_all(sqlglot.expressions.Column):
175 |             # TODO(renee): Handle quoted columns.
176 |             col_name = col.name.lower()
177 |             if col.this.quoted:
178 |                 col_name = col.name
179 |             col_names.add(col_name)
180 |         return sorted(list(col_names))
181 |     except Exception as ex:
182 |         raise ValueError(f"Failed to parse sql expression: {column.expr}. Error: {ex}")
183 | 
184 | 
185 | def direct_mapping_logical_columns(
186 |     table: semantic_model_pb2.Table,
187 | ) -> List[semantic_model_pb2.Column]:
188 |     """
189 |     Returns a list of logical columns that map 1:1 to an underlying physical column
190 |     (i.e. logical table's expression is simply the physical column name) in this table.
191 |     """
192 |     ret: List[semantic_model_pb2.Column] = []
193 |     for c in table.columns:
194 |         if _is_physical_table_column(c):
195 |             ret.append(c)
196 |     return ret
197 | 
198 | 
199 | def _enrich_column_in_expr_with_aggregation(
200 |     table: semantic_model_pb2.Table,
201 | ) -> semantic_model_pb2.Table:
202 |     """
203 |     Expands the logical columns of 'table' to include columns mentioned in a logical columns
204 |     with an aggregate expression. E.g. for a logical column called CPC with expr sum(cost) / sum(clicks),
205 |     adds logical columns for "cost" and "clicks", if not present.
206 |     """
207 |     direct_mapping_lcols = [
208 |         c.name.lower() for c in direct_mapping_logical_columns(table)
209 |     ]
210 |     cols_to_append = set()
211 |     for col in table.columns:
212 |         if not is_aggregation_expr(col):
213 |             continue
214 |         for pcol in get_all_physical_column_references(col):
215 |             # If the physical column doesn't have a direct mapping logical column
216 |             # with the same name, then we need to add a new logical column for it.
217 |             # Note that this may introduce multiple logical columns directly referencing
218 |             # the same physical column, something we should improve up, perhaps by
219 |             # rewriting the expression to use existing direct mapping logical columns
220 |             # whenever preset.
221 |             if pcol not in direct_mapping_lcols:
222 |                 cols_to_append.add(pcol)
223 | 
224 |     original_cols = {col.name.lower(): col.expr for col in table.columns}
225 |     ret = copy.deepcopy(table)
226 |     # Insert in sorted order to make this method deterministic.
227 |     for c in sorted(cols_to_append):
228 |         if c in original_cols:
229 |             logger.warning(
230 |                 f"Not adding a logical column for physical column {c} in table {table.name}, "
231 |                 f"since this logical column already exists with expression {original_cols[c]}"
232 |             )
233 |         else:
234 |             new_col = semantic_model_pb2.Column(name=c, expr=c)
235 |             ret.columns.append(new_col)
236 |     return ret
237 | 
238 | 
239 | def _generate_non_agg_cte(table: semantic_model_pb2.Table) -> Optional[str]:
240 |     """
241 |     Returns a CTE representing a logical table that selects 'col' columns from 'table' except for aggregation columns.
242 |     """
243 |     filtered_cols = [col for col in table.columns if not is_aggregation_expr(col)]
244 |     if len(filtered_cols) > 0:
245 |         return _generate_cte_for(table, filtered_cols)
246 |     else:
247 |         return None
248 | 
249 | 
250 | def _convert_to_snowflake_sql(sql: str) -> str:
251 |     """
252 |     Converts a given SQL statement to Snowflake SQL syntax using SQLGlot.
253 | 
254 |     Args:
255 |     sql (str): The SQL statement to convert.
256 | 
257 |     Returns:
258 |     str: The SQL statement in Snowflake syntax.
259 |     """
260 |     try:
261 |         expression = sqlglot.parse_one(sql, dialect=Snowflake)
262 |     except Exception as e:
263 |         raise ValueError(
264 |             f"Unable to parse sql statement.\n Provided sql: {sql}\n. Error: {e}"
265 |         )
266 | 
267 |     return expression.sql()
268 | 
269 | 
270 | def generate_select(
271 |     table_in_column_format: semantic_model_pb2.Table, limit: int
272 | ) -> List[str]:
273 |     """Generate select query for all columns for validation purpose."""
274 |     sqls_to_return: List[str] = []
275 |     # Generate select query for columns without aggregation exprs.
276 |     non_agg_cte = _generate_non_agg_cte(table_in_column_format)
277 |     if non_agg_cte is not None:
278 |         non_agg_sql = (
279 |             non_agg_cte
280 |             + f"SELECT * FROM {logical_table_name(table_in_column_format)} LIMIT {limit}"
281 |         )
282 |         sqls_to_return.append(_convert_to_snowflake_sql(non_agg_sql))
283 | 
284 |     # Generate select query for columns with aggregation exprs.
285 |     agg_cols = [
286 |         col for col in table_in_column_format.columns if is_aggregation_expr(col)
287 |     ]
288 |     if len(agg_cols) == 0:
289 |         return sqls_to_return
290 |     else:
291 |         agg_cte = _generate_cte_for(table_in_column_format, agg_cols)
292 |         agg_sql = (
293 |             agg_cte
294 |             + f"SELECT * FROM {logical_table_name(table_in_column_format)} LIMIT {limit}"
295 |         )
296 |         sqls_to_return.append(_convert_to_snowflake_sql(agg_sql))
297 |     return sqls_to_return
298 | 
299 | 
300 | def expand_all_logical_tables_as_ctes(
301 |     sql_query: str, model_in_column_format: semantic_model_pb2.SemanticModel
302 | ) -> str:
303 |     """
304 |     Returns a SQL query that expands all logical tables contained in ctx as ctes.
305 |     """
306 | 
307 |     def generate_full_logical_table_ctes(
308 |         ctx: semantic_model_pb2.SemanticModel,
309 |     ) -> List[str]:
310 |         """
311 |         Given an arbitrary SQL, returns a list of CTEs representing all the logical tables
312 |         referenced in it.
313 |         """
314 |         ctes: List[str] = []
315 |         for table in ctx.tables:
316 |             # Append all columns and expressions for the logical table.
317 |             # If table contains expr with aggregations, enrich its referred columns into the table.
318 |             table_ = _enrich_column_in_expr_with_aggregation(table)
319 |             cte = _generate_non_agg_cte(table_)
320 |             if cte is not None:
321 |                 ctes.append(cte)
322 |         return ctes
323 | 
324 |     # Step 1: Generate a CTE for each logical table referenced in the query.
325 |     ctes = generate_full_logical_table_ctes(model_in_column_format)
326 | 
327 |     # Step 2: Parse each generated CTE as a 'WITH' clause.
328 |     new_withs = []
329 |     for cte in ctes:
330 |         new_withs.append(
331 |             sqlglot.parse_one(cte, read=Snowflake, into=sqlglot.expressions.With)
332 |         )
333 | 
334 |     # Step 3: Prefix the CTEs to the original query.
335 |     ast = sqlglot.parse_one(sql_query, read=Snowflake)
336 |     with_ = ast.args.get("with")
337 |     # If the query doesn't have a WITH clause, then generate one.
338 |     if with_ is None:
339 |         merged_with = new_withs[0]
340 |         remaining_ctes = [w.expressions[0] for w in new_withs[1:]]
341 |         merged_with.set("expressions", merged_with.expressions + remaining_ctes)
342 |         ast.set("with", merged_with)
343 |     # If the query already has a WITH clause, prefix the CTEs to it.
344 |     else:
345 |         new_ctes = [w.expressions[0] for w in new_withs]
346 |         with_.set("expressions", new_ctes + with_.expressions)
347 |     return ast.sql(dialect=Snowflake, pretty=True)  # type: ignore [no-any-return]
348 | 
349 | 
350 | def context_to_column_format(
351 |     ctx: semantic_model_pb2.SemanticModel,
352 | ) -> semantic_model_pb2.SemanticModel:
353 |     """
354 |     Converts semantic_model_pb2.SemanticModel from a dimension/measure format to a column format.
355 |     Returns a new semantic_model_pb2.SemanticModel object that's in column format.
356 |     """
357 |     ret = semantic_model_pb2.SemanticModel()
358 |     ret.CopyFrom(ctx)
359 |     for table in ret.tables:
360 |         column_format = len(table.columns) > 0
361 |         dimension_measure_format = (
362 |             len(table.dimensions) > 0
363 |             or len(table.time_dimensions) > 0
364 |             or len(table.measures) > 0
365 |         )
366 |         if column_format and dimension_measure_format:
367 |             raise ValueError(
368 |                 "table {table.name} defines both columns and dimensions/time_dimensions/measures."
369 |             )
370 |         if column_format:
371 |             continue
372 |         for d in table.dimensions:
373 |             col = semantic_model_pb2.Column()
374 |             col.kind = semantic_model_pb2.ColumnKind.dimension
375 |             col.name = d.name
376 |             col.synonyms.extend(d.synonyms)
377 |             col.description = d.description
378 |             col.expr = d.expr
379 |             col.data_type = d.data_type
380 |             col.unique = d.unique
381 |             col.sample_values.extend(d.sample_values)
382 |             table.columns.append(col)
383 |         del table.dimensions[:]
384 | 
385 |         for td in table.time_dimensions:
386 |             col = semantic_model_pb2.Column()
387 |             col.kind = semantic_model_pb2.ColumnKind.time_dimension
388 |             col.name = td.name
389 |             col.synonyms.extend(td.synonyms)
390 |             col.description = td.description
391 |             col.expr = td.expr
392 |             col.data_type = td.data_type
393 |             col.unique = td.unique
394 |             col.sample_values.extend(td.sample_values)
395 |             table.columns.append(col)
396 |         del table.time_dimensions[:]
397 | 
398 |         for m in table.measures:
399 |             col = semantic_model_pb2.Column()
400 |             col.kind = semantic_model_pb2.ColumnKind.measure
401 |             col.name = m.name
402 |             col.synonyms.extend(m.synonyms)
403 |             col.description = m.description
404 |             col.expr = m.expr
405 |             col.data_type = m.data_type
406 |             col.default_aggregation = m.default_aggregation
407 |             col.sample_values.extend(m.sample_values)
408 |             table.columns.append(col)
409 |         del table.measures[:]
410 |     return ret
411 | 


--------------------------------------------------------------------------------
/semantic_model_generator/data_processing/cte_utils_test.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | 
 3 | import pytest
 4 | 
 5 | from semantic_model_generator.data_processing.cte_utils import remove_ltable_cte
 6 | 
 7 | 
 8 | class TestRemoveLogicalTableCTE:
 9 |     def test_removes_logical_table_cte(self) -> None:
10 |         """
11 |         Testing that we remove logical table CTEs corresponding to existing table names.
12 |         """
13 |         query = "WITH __logical_table AS (SELECT * FROM table1) SELECT * FROM __logical_table"
14 |         table_names = ["LOGICAL_TABLE"]
15 |         expected_query = "SELECT * FROM __logical_table"
16 | 
17 |         actual_output = remove_ltable_cte(query, table_names=table_names)
18 |         actual_output = re.sub(r"\s+", " ", actual_output)
19 | 
20 |         assert actual_output == expected_query
21 | 
22 |     def test_does_not_remove_non_logical_cte(self) -> None:
23 |         """
24 |         Testing that CTEs not mapping to existing table names are not removed.
25 |         """
26 |         query = (
27 |             "WITH __other_table AS (SELECT * FROM table1) SELECT * FROM __other_table"
28 |         )
29 |         table_names = ["LOGICAL_TABLE"]
30 |         expected_query = (
31 |             "WITH __other_table AS ( SELECT * FROM table1 ) SELECT * FROM __other_table"
32 |         )
33 | 
34 |         actual_output = remove_ltable_cte(query, table_names=table_names)
35 |         actual_output = re.sub(r"\s+", " ", actual_output)
36 | 
37 |         assert actual_output == expected_query
38 | 
39 |     def test_mixed_ctes(self) -> None:
40 |         """
41 |         Given a query containing a mixture of CTEs, only the logical table CTEs should be removed.
42 |         """
43 |         query = "WITH __logical_table AS (SELECT * FROM table1), __other_table AS (SELECT * FROM table2), __custom_table AS (SELECT * FROM table3) SELECT * FROM __logical_table"
44 |         table_names = ["LOGICAL_TABLE"]
45 |         expected_query = "WITH __other_table AS ( SELECT * FROM table2 ), __custom_table AS ( SELECT * FROM table3 ) SELECT * FROM __logical_table"
46 | 
47 |         actual_output = remove_ltable_cte(query, table_names=table_names)
48 |         actual_output = re.sub(r"\s+", " ", actual_output)
49 | 
50 |         assert actual_output == expected_query
51 | 
52 |     def test_throws_value_error_without_cte(self) -> None:
53 |         """
54 |         Testing that an error is thrown if there is no CTE in the query.
55 |         """
56 |         query = "SELECT * FROM table1"
57 |         table_names = ["LOGICAL_TABLE"]
58 | 
59 |         with pytest.raises(ValueError):
60 |             remove_ltable_cte(query, table_names=table_names)
61 | 
62 |     def test_throws_value_error_if_first_cte_not_logical_table(self) -> None:
63 |         """
64 |         Testing that an error is thrown if the first CTE is not a logical table.
65 |         """
66 |         query = "WITH random_alias AS (SELECT * FROM table1), __logical_table AS (SELECT * FROM table2) SELECT * FROM __logical_table"
67 |         table_names = ["LOGICAL_TABLE"]
68 | 
69 |         with pytest.raises(ValueError):
70 |             remove_ltable_cte(query, table_names=table_names)
71 | 


--------------------------------------------------------------------------------
/semantic_model_generator/data_processing/data_types.py:
--------------------------------------------------------------------------------
 1 | from typing import Any, List, Optional
 2 | 
 3 | from pydantic.dataclasses import dataclass
 4 | 
 5 | 
 6 | @dataclass
 7 | class FQNParts:
 8 |     database: str
 9 |     schema_name: str
10 |     table: str
11 | 
12 |     def __post_init__(self: Any) -> None:
13 |         """Uppercase table name"""
14 |         self.table = self.table.upper()
15 | 
16 | 
17 | @dataclass
18 | class Column:
19 |     id_: int
20 |     column_name: str
21 |     column_type: str
22 |     values: Optional[List[str]] = None
23 |     comment: Optional[str] = (
24 |         None  # comment field's to save the column comment user specified on the column
25 |     )
26 | 
27 |     def __post_init__(self: Any) -> None:
28 |         """
29 |         Update column_type to cleaned up version, eg. NUMBER(38,0) -> NUMBER
30 |         """
31 | 
32 |         self.column_type = self.column_type.split("(")[0].strip().upper()
33 | 
34 | 
35 | @dataclass
36 | class Table:
37 |     id_: int
38 |     name: str
39 |     columns: List[Column]
40 |     comment: Optional[str] = (
41 |         None  # comment field's to save the table comment user specified on the table
42 |     )
43 | 
44 |     def __post_init__(self: Any) -> None:
45 |         for col in self.columns:
46 |             if col.column_name == "":
47 |                 raise ValueError("column name in table must be nonempty")
48 | 


--------------------------------------------------------------------------------
/semantic_model_generator/data_processing/proto_utils.py:
--------------------------------------------------------------------------------
 1 | import io
 2 | import json
 3 | from typing import Any, TypeVar
 4 | 
 5 | import ruamel.yaml
 6 | from google.protobuf import json_format
 7 | from google.protobuf.message import Message
 8 | from strictyaml import dirty_load
 9 | 
10 | from semantic_model_generator.protos import semantic_model_pb2
11 | from semantic_model_generator.validate.schema import SCHEMA
12 | 
13 | ProtoMsg = TypeVar("ProtoMsg", bound=Message)
14 | 
15 | 
16 | def proto_to_yaml(message: ProtoMsg) -> str:
17 |     """Serializes the input proto into a yaml message.
18 | 
19 |     Args:
20 |         message: Protobuf message to be serialized.
21 | 
22 |     Returns:
23 |         The serialized yaml string, or None if an error occurs.
24 |     """
25 |     try:
26 |         json_data = json.loads(
27 |             json_format.MessageToJson(message, preserving_proto_field_name=True)
28 |         )
29 | 
30 |         # Using ruamel.yaml package to preserve message order.
31 |         yaml = ruamel.yaml.YAML()
32 |         yaml.indent(mapping=2, sequence=4, offset=2)
33 |         yaml.preserve_quotes = True
34 | 
35 |         with io.StringIO() as stream:
36 |             yaml.dump(json_data, stream)
37 |             yaml_str = stream.getvalue()
38 |         assert isinstance(yaml_str, str)
39 |         return yaml_str
40 |     except Exception as e:
41 |         raise ValueError(f"Failed to convert protobuf message to YAML: {e}")
42 | 
43 | 
44 | def proto_to_dict(message: ProtoMsg) -> dict[str, Any]:
45 |     """Serializes the input proto into a dictionary.
46 | 
47 |     Args:
48 |         message: Protobuf message to be serialized.
49 | 
50 |     Returns:
51 |         The serialized dictionary, or None if an error occurs.
52 |     """
53 |     try:
54 |         # Convert the Protobuf message to JSON string.
55 |         json_str = json_format.MessageToJson(message, preserving_proto_field_name=True)
56 | 
57 |         # Convert the JSON string to a Python dictionary.
58 |         json_data = json.loads(json_str)
59 | 
60 |         assert isinstance(json_data, dict)
61 |         return json_data
62 |     except Exception as e:
63 |         raise ValueError(f"Failed to convert protobuf message to dictionary: {e}")
64 | 
65 | 
66 | def yaml_to_semantic_model(yaml_str: str) -> semantic_model_pb2.SemanticModel:
67 |     """
68 |     Deserializes the input yaml into a SemanticModel Protobuf message. The
69 |     input yaml must be fully representable as json, so yaml features like
70 |     custom types and block scalars are not supported.
71 | 
72 |     Args:
73 |         yaml_str: Path to the YAML file.
74 | 
75 |     Returns:
76 |         The deserialized SemanticModel protobuf message
77 |     """
78 | 
79 |     # strictyaml is very opinionated on the style of yaml, and rejects yamls that use flow style (e.g. lists with []
80 |     # or maps with {}). See https://hitchdev.com/strictyaml/why/flow-style-removed/. This is purely a style preference
81 |     # and those yamls are still parsable. To allow such yamls, we use dirty_load here, which behaves exactly as the
82 |     # load method but allows flow style.
83 |     parsed_yaml = dirty_load(
84 |         yaml_str, SCHEMA, label="semantic model", allow_flow_style=True
85 |     )
86 |     msg = semantic_model_pb2.SemanticModel()
87 |     return json_format.ParseDict(parsed_yaml.data, msg)
88 | 


--------------------------------------------------------------------------------
/semantic_model_generator/generate_model.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from datetime import datetime
  3 | from typing import List, Optional
  4 | 
  5 | from loguru import logger
  6 | from snowflake.connector import SnowflakeConnection
  7 | 
  8 | from semantic_model_generator.data_processing import data_types, proto_utils
  9 | from semantic_model_generator.protos import semantic_model_pb2
 10 | from semantic_model_generator.snowflake_utils.snowflake_connector import (
 11 |     AUTOGEN_TOKEN,
 12 |     DIMENSION_DATATYPES,
 13 |     MEASURE_DATATYPES,
 14 |     OBJECT_DATATYPES,
 15 |     TIME_MEASURE_DATATYPES,
 16 |     get_table_representation,
 17 |     get_valid_schemas_tables_columns_df,
 18 | )
 19 | from semantic_model_generator.snowflake_utils.utils import create_fqn_table
 20 | from semantic_model_generator.validate.context_length import validate_context_length
 21 | 
 22 | _PLACEHOLDER_COMMENT = "  "
 23 | _FILL_OUT_TOKEN = " # <FILL-OUT>"
 24 | # TODO add _AUTO_GEN_TOKEN to the end of the auto generated descriptions.
 25 | _AUTOGEN_COMMENT_TOKEN = (
 26 |     " # <AUTO-GENERATED DESCRIPTION, PLEASE MODIFY AND REMOVE THE __ AT THE END>"
 27 | )
 28 | _DEFAULT_N_SAMPLE_VALUES_PER_COL = 3
 29 | _AUTOGEN_COMMENT_WARNING = f"# NOTE: This file was auto-generated by the semantic model generator. Please fill out placeholders marked with {_FILL_OUT_TOKEN} (or remove if not relevant) and verify autogenerated comments.\n"
 30 | 
 31 | 
 32 | def _get_placeholder_filter() -> List[semantic_model_pb2.NamedFilter]:
 33 |     return [
 34 |         semantic_model_pb2.NamedFilter(
 35 |             name=_PLACEHOLDER_COMMENT,
 36 |             synonyms=[_PLACEHOLDER_COMMENT],
 37 |             description=_PLACEHOLDER_COMMENT,
 38 |             expr=_PLACEHOLDER_COMMENT,
 39 |         )
 40 |     ]
 41 | 
 42 | 
 43 | def _get_placeholder_joins() -> List[semantic_model_pb2.Relationship]:
 44 |     return [
 45 |         semantic_model_pb2.Relationship(
 46 |             name=_PLACEHOLDER_COMMENT,
 47 |             left_table=_PLACEHOLDER_COMMENT,
 48 |             right_table=_PLACEHOLDER_COMMENT,
 49 |             join_type=semantic_model_pb2.JoinType.inner,
 50 |             relationship_columns=[
 51 |                 semantic_model_pb2.RelationKey(
 52 |                     left_column=_PLACEHOLDER_COMMENT,
 53 |                     right_column=_PLACEHOLDER_COMMENT,
 54 |                 )
 55 |             ],
 56 |             relationship_type=semantic_model_pb2.RelationshipType.many_to_one,
 57 |         )
 58 |     ]
 59 | 
 60 | 
 61 | def _raw_table_to_semantic_context_table(
 62 |     database: str, schema: str, raw_table: data_types.Table
 63 | ) -> semantic_model_pb2.Table:
 64 |     """
 65 |     Converts a raw table representation to a semantic model table in protobuf format.
 66 | 
 67 |     Args:
 68 |         database (str): The name of the database containing the table.
 69 |         schema (str): The name of the schema containing the table.
 70 |         raw_table (data_types.Table): The raw table object to be transformed.
 71 | 
 72 |     Returns:
 73 |         semantic_model_pb2.Table: A protobuf representation of the semantic table.
 74 | 
 75 |     This function categorizes table columns into TimeDimensions, Dimensions, or Measures based on their data type,
 76 |     populates them with sample values, and sets placeholders for descriptions and filters.
 77 |     """
 78 | 
 79 |     # For each column, decide if it is a TimeDimension, Measure, or Dimension column.
 80 |     # For now, we decide this based on datatype.
 81 |     # Any time datatype, is TimeDimension.
 82 |     # Any varchar/text is Dimension.
 83 |     # Any numerical column is Measure.
 84 | 
 85 |     time_dimensions = []
 86 |     dimensions = []
 87 |     measures = []
 88 | 
 89 |     for col in raw_table.columns:
 90 |         if col.column_type.upper() in TIME_MEASURE_DATATYPES:
 91 |             time_dimensions.append(
 92 |                 semantic_model_pb2.TimeDimension(
 93 |                     name=col.column_name,
 94 |                     expr=col.column_name,
 95 |                     data_type=col.column_type,
 96 |                     sample_values=col.values,
 97 |                     synonyms=[_PLACEHOLDER_COMMENT],
 98 |                     description=col.comment if col.comment else _PLACEHOLDER_COMMENT,
 99 |                 )
100 |             )
101 | 
102 |         elif col.column_type.upper() in DIMENSION_DATATYPES:
103 |             dimensions.append(
104 |                 semantic_model_pb2.Dimension(
105 |                     name=col.column_name,
106 |                     expr=col.column_name,
107 |                     data_type=col.column_type,
108 |                     sample_values=col.values,
109 |                     synonyms=[_PLACEHOLDER_COMMENT],
110 |                     description=col.comment if col.comment else _PLACEHOLDER_COMMENT,
111 |                 )
112 |             )
113 | 
114 |         elif col.column_type.upper() in MEASURE_DATATYPES:
115 |             measures.append(
116 |                 semantic_model_pb2.Fact(
117 |                     name=col.column_name,
118 |                     expr=col.column_name,
119 |                     data_type=col.column_type,
120 |                     sample_values=col.values,
121 |                     synonyms=[_PLACEHOLDER_COMMENT],
122 |                     description=col.comment if col.comment else _PLACEHOLDER_COMMENT,
123 |                 )
124 |             )
125 |         elif col.column_type.upper() in OBJECT_DATATYPES:
126 |             logger.warning(
127 |                 f"""We don't currently support {col.column_type} as an input column datatype to the Semantic Model. We are skipping column {col.column_name} for now."""
128 |             )
129 |             continue
130 |         else:
131 |             logger.warning(
132 |                 f"Column datatype does not map to a known datatype. Input was = {col.column_type}. We are going to place as a Dimension for now."
133 |             )
134 |             dimensions.append(
135 |                 semantic_model_pb2.Dimension(
136 |                     name=col.column_name,
137 |                     expr=col.column_name,
138 |                     data_type=col.column_type,
139 |                     sample_values=col.values,
140 |                     synonyms=[_PLACEHOLDER_COMMENT],
141 |                     description=col.comment if col.comment else _PLACEHOLDER_COMMENT,
142 |                 )
143 |             )
144 |     if len(time_dimensions) + len(dimensions) + len(measures) == 0:
145 |         raise ValueError(
146 |             f"No valid columns found for table {raw_table.name}. Please verify that this table contains column's datatypes not in {OBJECT_DATATYPES}."
147 |         )
148 | 
149 |     return semantic_model_pb2.Table(
150 |         name=raw_table.name,
151 |         base_table=semantic_model_pb2.FullyQualifiedTable(
152 |             database=database, schema=schema, table=raw_table.name
153 |         ),
154 |         # For fields we can not automatically infer, leave a comment for the user to fill out.
155 |         description=raw_table.comment if raw_table.comment else _PLACEHOLDER_COMMENT,
156 |         filters=_get_placeholder_filter(),
157 |         dimensions=dimensions,
158 |         time_dimensions=time_dimensions,
159 |         measures=measures,
160 |     )
161 | 
162 | 
163 | def raw_schema_to_semantic_context(
164 |     base_tables: List[str],
165 |     semantic_model_name: str,
166 |     conn: SnowflakeConnection,
167 |     n_sample_values: int = _DEFAULT_N_SAMPLE_VALUES_PER_COL,
168 |     allow_joins: Optional[bool] = False,
169 | ) -> semantic_model_pb2.SemanticModel:
170 |     """
171 |     Converts a list of fully qualified Snowflake table names into a semantic model.
172 | 
173 |     Parameters:
174 |     - base_tables  (list[str]): Fully qualified table names to include in the semantic model.
175 |     - snowflake_account (str): Snowflake account identifier.
176 |     - semantic_model_name (str): A meaningful semantic model name.
177 |     - conn (SnowflakeConnection): SnowflakeConnection to reuse.
178 |     - n_sample_values (int): The number of sample values per col.
179 | 
180 |     Returns:
181 |     - The semantic model (semantic_model_pb2.SemanticModel).
182 | 
183 |     This function fetches metadata for the specified tables, performs schema validation, extracts key information,
184 |     enriches metadata from the Snowflake database, and constructs a semantic model in protobuf format.
185 |     It handles different databases and schemas within the same account by creating unique Snowflake connections as needed.
186 | 
187 |     Raises:
188 |     - AssertionError: If no valid tables are found in the specified schema.
189 |     """
190 | 
191 |     # For FQN tables, create a new snowflake connection per table in case the db/schema is different.
192 |     table_objects = []
193 |     unique_database_schema: List[str] = []
194 |     for table in base_tables:
195 |         # Verify this is a valid FQN table. For now, we check that the table follows the following format.
196 |         # {database}.{schema}.{table}
197 |         fqn_table = create_fqn_table(table)
198 |         fqn_databse_schema = f"{fqn_table.database}.{fqn_table.schema_name}"
199 | 
200 |         if fqn_databse_schema not in unique_database_schema:
201 |             unique_database_schema.append(fqn_databse_schema)
202 | 
203 |         logger.info(f"Pulling column information from {fqn_table}")
204 |         valid_schemas_tables_columns_df = get_valid_schemas_tables_columns_df(
205 |             conn=conn,
206 |             db_name=fqn_table.database,
207 |             table_schema=fqn_table.schema_name,
208 |             table_names=[fqn_table.table],
209 |         )
210 |         assert not valid_schemas_tables_columns_df.empty
211 | 
212 |         # get the valid columns for this table.
213 |         valid_columns_df_this_table = valid_schemas_tables_columns_df[
214 |             valid_schemas_tables_columns_df["TABLE_NAME"] == fqn_table.table
215 |         ]
216 | 
217 |         raw_table = get_table_representation(
218 |             conn=conn,
219 |             schema_name=fqn_databse_schema,  # Fully-qualified schema
220 |             table_name=fqn_table.table,  # Non-qualified table name
221 |             table_index=0,
222 |             ndv_per_column=n_sample_values,  # number of sample values to pull per column.
223 |             columns_df=valid_columns_df_this_table,
224 |             max_workers=1,
225 |         )
226 |         table_object = _raw_table_to_semantic_context_table(
227 |             database=fqn_table.database,
228 |             schema=fqn_table.schema_name,
229 |             raw_table=raw_table,
230 |         )
231 |         table_objects.append(table_object)
232 |     # TODO(jhilgart): Call cortex model to generate a semantically friendly name here.
233 | 
234 |     placeholder_relationships = _get_placeholder_joins() if allow_joins else None
235 |     context = semantic_model_pb2.SemanticModel(
236 |         name=semantic_model_name,
237 |         tables=table_objects,
238 |         relationships=placeholder_relationships,
239 |     )
240 |     return context
241 | 
242 | 
243 | def comment_out_section(yaml_str: str, section_name: str) -> str:
244 |     """
245 |     Comments out all lines in the specified section of a YAML string.
246 | 
247 |     Parameters:
248 |     - yaml_str (str): The YAML string to process.
249 |     - section_name (str): The name of the section to comment out.
250 | 
251 |     Returns:
252 |     - str: The modified YAML string with the specified section commented out.
253 |     """
254 |     updated_yaml = []
255 |     lines = yaml_str.split("\n")
256 |     in_section = False
257 |     section_indent_level = 0
258 | 
259 |     for line in lines:
260 |         stripped_line = line.strip()
261 | 
262 |         # When we find a section with the provided name, we can start commenting out lines.
263 |         if stripped_line.startswith(f"{section_name}:"):
264 |             in_section = True
265 |             section_indent_level = len(line) - len(line.lstrip())
266 |             comment_indent = " " * section_indent_level
267 |             updated_yaml.append(f"{comment_indent}# {line.strip()}")
268 |             continue
269 | 
270 |         # Since this method parses a raw YAML string, we track whether we're in the section by the indentation level.
271 |         # This is a pretty rough heuristic.
272 |         current_indent_level = len(line) - len(line.lstrip())
273 |         if (
274 |             in_section
275 |             and current_indent_level <= section_indent_level
276 |             and stripped_line
277 |         ):
278 |             in_section = False
279 | 
280 |         # Comment out the field and its subsections, preserving the indentation level.
281 |         if in_section and line.strip():
282 |             comment_indent = " " * current_indent_level
283 |             updated_yaml.append(f"{comment_indent}# {line.strip()}")
284 |         else:
285 |             updated_yaml.append(line)
286 | 
287 |     return "\n".join(updated_yaml)
288 | 
289 | 
290 | def append_comment_to_placeholders(yaml_str: str) -> str:
291 |     """
292 |     Finds all instances of a specified placeholder in a YAML string and appends a given text to these placeholders.
293 |     This is the homework to fill out after your yaml is generated.
294 | 
295 |     Parameters:
296 |     - yaml_str (str): The YAML string to process.
297 | 
298 |     Returns:
299 |     - str: The modified YAML string with appended text to placeholders.
300 |     """
301 |     updated_yaml = []
302 |     # Split the string into lines to process each line individually
303 |     lines = yaml_str.split("\n")
304 | 
305 |     for line in lines:
306 |         # Check if the placeholder is in the current line.
307 |         # Strip the last quote to match.
308 |         if line.rstrip("'").endswith(_PLACEHOLDER_COMMENT):
309 |             # Replace the _PLACEHOLDER_COMMENT with itself plus the append_text
310 |             updated_line = line + _FILL_OUT_TOKEN
311 |             updated_yaml.append(updated_line)
312 |         elif line.rstrip("'").endswith(AUTOGEN_TOKEN):
313 |             updated_line = line + _AUTOGEN_COMMENT_TOKEN
314 |             updated_yaml.append(updated_line)
315 |         # Add comments to specific fields in certain sections.
316 |         elif line.lstrip().startswith("join_type"):
317 |             updated_line = line + _FILL_OUT_TOKEN + "  supported: inner, left_outer"
318 |             updated_yaml.append(updated_line)
319 |         elif line.lstrip().startswith("relationship_type"):
320 |             updated_line = (
321 |                 line + _FILL_OUT_TOKEN + " supported: many_to_one, one_to_one"
322 |             )
323 |             updated_yaml.append(updated_line)
324 |         else:
325 |             updated_yaml.append(line)
326 | 
327 |     # Join the lines back together into a single string
328 |     return "\n".join(updated_yaml)
329 | 
330 | 
331 | def _to_snake_case(s: str) -> str:
332 |     """
333 |     Convert a string into snake case.
334 | 
335 |     Parameters:
336 |     s (str): The string to convert.
337 | 
338 |     Returns:
339 |     str: The snake case version of the string.
340 |     """
341 |     # Replace common delimiters with spaces
342 |     s = s.replace("-", " ").replace("_", " ")
343 | 
344 |     words = s.split(" ")
345 | 
346 |     # Convert each word to lowercase and join with underscores
347 |     snake_case_str = "_".join([word.lower() for word in words if word]).strip()
348 | 
349 |     return snake_case_str
350 | 
351 | 
352 | def generate_base_semantic_model_from_snowflake(
353 |     base_tables: List[str],
354 |     conn: SnowflakeConnection,
355 |     semantic_model_name: str,
356 |     n_sample_values: int = _DEFAULT_N_SAMPLE_VALUES_PER_COL,
357 |     output_yaml_path: Optional[str] = None,
358 | ) -> None:
359 |     """
360 |     Generates a base semantic context from specified Snowflake tables and exports it to a YAML file.
361 | 
362 |     Parameters:
363 |         base_tables : Fully qualified names of Snowflake tables to include in the semantic context.
364 |         conn: SnowflakeConnection to reuse.
365 |         snowflake_account: Identifier of the Snowflake account.
366 |         semantic_model_name: The human readable model name. This should be semantically meaningful to an organization.
367 |         output_yaml_path: Path for the output YAML file. If None, defaults to 'semantic_model_generator/output_models/YYYYMMDDHHMMSS_<semantic_model_name>.yaml'.
368 |         n_sample_values: The number of sample values to populate for all columns.
369 | 
370 |     Returns:
371 |         None. Writes the semantic context to a YAML file.
372 |     """
373 |     formatted_datetime = datetime.now().strftime("%Y%m%d%H%M%S")
374 |     if not output_yaml_path:
375 |         file_name = f"{formatted_datetime}_{_to_snake_case(semantic_model_name)}.yaml"
376 |         if os.path.exists("semantic_model_generator/output_models"):
377 |             write_path = f"semantic_model_generator/output_models/{file_name}"
378 |         else:
379 |             write_path = f"./{file_name}"
380 |     else:  # Assume user gives correct path.
381 |         write_path = output_yaml_path
382 | 
383 |     yaml_str = generate_model_str_from_snowflake(
384 |         base_tables,
385 |         n_sample_values=n_sample_values if n_sample_values > 0 else 1,
386 |         semantic_model_name=semantic_model_name,
387 |         conn=conn,
388 |     )
389 | 
390 |     with open(write_path, "w") as f:
391 |         # Clarify that the YAML was autogenerated and that placeholders should be filled out/deleted.
392 |         f.write(_AUTOGEN_COMMENT_WARNING)
393 |         f.write(yaml_str)
394 | 
395 |     logger.info(f"Semantic model saved to {write_path}")
396 | 
397 |     return None
398 | 
399 | 
400 | def generate_model_str_from_snowflake(
401 |     base_tables: List[str],
402 |     semantic_model_name: str,
403 |     conn: SnowflakeConnection,
404 |     n_sample_values: int = _DEFAULT_N_SAMPLE_VALUES_PER_COL,
405 |     allow_joins: Optional[bool] = False,
406 | ) -> str:
407 |     """
408 |     Generates a base semantic context from specified Snowflake tables and returns the raw string.
409 | 
410 |     Parameters:
411 |         base_tables : Fully qualified names of Snowflake tables to include in the semantic context.
412 |         semantic_model_name: The human readable model name. This should be semantically meaningful to an organization.
413 |         conn: SnowflakeConnection to reuse.
414 |         n_sample_values: The number of sample values to populate for all columns.
415 |         allow_joins: Whether to allow joins in the semantic context.
416 | 
417 |     Returns:
418 |         str: The raw string of the semantic context.
419 |     """
420 |     context = raw_schema_to_semantic_context(
421 |         base_tables,
422 |         n_sample_values=n_sample_values if n_sample_values > 0 else 1,
423 |         semantic_model_name=semantic_model_name,
424 |         allow_joins=allow_joins,
425 |         conn=conn,
426 |     )
427 |     # Validate the generated yaml is within context limits.
428 |     # We just throw a warning here to allow users to update.
429 |     validate_context_length(context)
430 | 
431 |     yaml_str = proto_utils.proto_to_yaml(context)
432 |     # Once we have the yaml, update to include to # <FILL-OUT> tokens.
433 |     yaml_str = append_comment_to_placeholders(yaml_str)
434 |     # Comment out the filters section as we don't have a way to auto-generate these yet.
435 |     yaml_str = comment_out_section(yaml_str, "filters")
436 |     yaml_str = comment_out_section(yaml_str, "relationships")
437 | 
438 |     return yaml_str
439 | 


--------------------------------------------------------------------------------
/semantic_model_generator/output_models/.keep:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Snowflake-Labs/semantic-model-generator/2bf84ac8d2a11d7e9468c03390cc97967530f6a0/semantic_model_generator/output_models/.keep


--------------------------------------------------------------------------------
/semantic_model_generator/protos/semantic_model.proto:
--------------------------------------------------------------------------------
  1 | // If you make changes to this file, you'll need to run protoc to updated the
  2 | // generated files by running the following command:
  3 | //
  4 | // python -m grpc_tools.protoc -I=semantic_model_generator/protos/ --python_out=semantic_model_generator/protos/ --pyi_out=semantic_model_generator/protos/ semantic_model_generator/protos/semantic_model.proto
  5 | 
  6 | syntax = "proto3";
  7 | 
  8 | import "google/protobuf/descriptor.proto";
  9 | 
 10 | package semantic_model_generator;
 11 | 
 12 | // We are using proto FieldOptions to validate the YAMLs match the expected
 13 | // schema. Any fields with the `optional` option, are not required during YAML
 14 | // validation.
 15 | extend google.protobuf.FieldOptions {
 16 |   optional bool optional = 51234;
 17 |   optional bool sql_expression = 51235;
 18 |   optional bool id_field = 51236;
 19 | }
 20 | 
 21 | // AggregationType defines a list of various aggregations.
 22 | enum AggregationType {
 23 |   aggregation_type_unknown = 0;
 24 |   sum = 1;
 25 |   avg = 2;
 26 |   median = 7;
 27 |   min = 3;
 28 |   max = 4;
 29 |   count = 5;
 30 |   count_distinct = 6;
 31 | }
 32 | 
 33 | // ColumnKind defines various kinds of columns, mainly categorized into
 34 | // dimensions and measures.
 35 | enum ColumnKind {
 36 |   column_kind_unknown = 0;
 37 |   // A column containing categorical values such as names, countries, dates.
 38 |   dimension = 1;
 39 |   // A column containing numerical values such as revenue, impressions, salary.
 40 |   // TODO: migrate to fact.
 41 |   measure = 2;
 42 |   // A column containing date/time data.
 43 |   time_dimension = 3;
 44 |   // A "column" containing calculations about an entity such as sum_revenue,
 45 |   // cvr.
 46 |   metric = 4;
 47 | }
 48 | 
 49 | message RetrievalResult {
 50 |   string value = 1;
 51 |   float score = 2;
 52 | }
 53 | 
 54 | // Column is analogous to a database column and defines various semantic
 55 | // properties of a column.  A column can either simply be a column in the base
 56 | // database schema or it can be an arbitrary expression over the base schema,
 57 | // e.g. `base_column1 + base_column2`.
 58 | message Column {
 59 |   // A descriptive name for this column.
 60 |   string name = 1 [ (id_field) = true ];
 61 |   // A list of other terms/phrases used to refer to this column.
 62 |   repeated string synonyms = 2 [ (optional) = true ];
 63 |   // A brief description about this column, including things like what data this
 64 |   // column has.
 65 |   string description = 3 [ (optional) = true ];
 66 |   // The SQL expression for this column. Could simply be a base table column
 67 |   // name or an arbitrary SQL expression over one or more columns of the base
 68 |   // table.
 69 |   string expr = 4 [ (sql_expression) = true ];
 70 |   // The data type of this column.
 71 |   // TODO(nsehrawat): Consider creating an enum instead, with all snowflake
 72 |   // support data types.
 73 |   string data_type = 5;
 74 |   // The kind of this column - dimension or fact, metric.
 75 |   ColumnKind kind = 6;
 76 |   // If true, assume that this column has unique values.
 77 |   bool unique = 7 [ (optional) = true ];
 78 |   // If no aggregation is specified, then this is the default aggregation
 79 |   // applied to this column in contxt of a grouping.
 80 |   AggregationType default_aggregation = 8 [ (optional) = true, deprecated = true ];
 81 |   // Sample values of this column.
 82 |   repeated string sample_values = 9 [ (optional) = true ];
 83 |   // Whether to index the values and retrieve them based on the question.
 84 |   // If False, all sample values will be used as input to the model.
 85 |   bool index_and_retrieve_values = 10 [ (optional) = true ];
 86 |   // Retrieved literals of this column.
 87 |   repeated RetrievalResult retrieved_literals = 11 [ (optional) = true ];
 88 | 
 89 |   // A Cortex Search Service configured on this column to retrieve literals.
 90 |   string cortex_search_service_name = 12
 91 |       [ (optional) = true, deprecated = true ];
 92 |   CortexSearchService cortex_search_service = 13 [ (optional) = true ];
 93 |   // If true, this column has limited possible values, all of which are in
 94 |   // the sample_values field.
 95 |   bool is_enum = 14 [ (optional) = true ];
 96 | }
 97 | 
 98 | // Dimension columns contain categorical values (e.g. state, user_type,
 99 | // platform). NOTE: If modifying this protobuf, make appropriate changes in
100 | // context_to_column_format() of snowpilot/semantic_context/protos/schema.py.
101 | message Dimension {
102 |   // A descriptive name for this dimension.
103 |   string name = 1 [ (id_field) = true ];
104 |   // A list of other terms/phrases used to refer to this dimension.
105 |   repeated string synonyms = 2 [ (optional) = true ];
106 |   // A brief description about this dimension, including things like
107 |   // what data this dimension has.
108 |   string description = 3 [ (optional) = true ];
109 |   // The SQL expression defining this dimension. Could simply be a physical
110 |   // column name or an arbitrary SQL expression over one or more columns of the
111 |   // physical table.
112 |   string expr = 4 [ (sql_expression) = true ];
113 |   // The data type of this dimension.
114 |   // TODO(nsehrawat): Consider creating an enum instead with all snowflake
115 |   // support data types.
116 |   string data_type = 5;
117 |   // If true, assume that this dimension has unique values.
118 |   bool unique = 6 [ (optional) = true ];
119 |   // Sample values of this column.
120 |   repeated string sample_values = 7 [ (optional) = true ];
121 |   // A Cortex Search Service configured on this column to retrieve literals.
122 |   CortexSearchService cortex_search_service = 8 [ (optional) = true ];
123 |   string cortex_search_service_name = 9
124 |       [ (optional) = true, deprecated = true ];
125 |   // If true, this column has limited possible values, all of which are in
126 |   // the sample_values field.
127 |   bool is_enum = 10 [ (optional) = true ];
128 | }
129 | 
130 | // Fully qualified Cortex Search Service name.
131 | message CortexSearchService {
132 |   string database = 1 [ (optional) = true ];
133 |   string schema = 2 [ (optional) = true ];
134 |   string service = 3;
135 |   string literal_column = 4 [ (optional) = true ];
136 | }
137 | 
138 | // Time dimension columns contain time values (e.g. sale_date, created_at,
139 | // year). NOTE: If modifying this protobuf, make appropriate changes in
140 | // to_column_format() of snowpilot/semantic_context/utils/utils.py.
141 | message TimeDimension {
142 |   // A descriptive name for this time dimension.
143 |   string name = 1 [ (id_field) = true ];
144 |   // A list of other terms/phrases used to refer to this time dimension.
145 |   repeated string synonyms = 2 [ (optional) = true ];
146 |   // A brief description about this time dimension, including things like
147 |   // what data it has, the timezone of values, etc.
148 |   string description = 3 [ (optional) = true ];
149 |   // The SQL expression defining this time dimension. Could simply be a physical
150 |   // column name or an arbitrary SQL expression over one or more columns of the
151 |   // physical table.
152 |   string expr = 4 [ (sql_expression) = true ];
153 |   // The data type of this time dimension.
154 |   // TODO(nsehrawat): Consider creating an enum instead, with all snowflake
155 |   // support data types.
156 |   string data_type = 5;
157 |   // If true, assume that this time dimension has unique values.
158 |   bool unique = 6 [ (optional) = true ];
159 |   // Sample values of this time dimension.
160 |   repeated string sample_values = 7 [ (optional) = true ];
161 | }
162 | 
163 | // Measure columns contain numerical values (e.g. revenue, impressions, salary).
164 | // NOTE: If modifying this protobuf, make appropriate changes in
165 | // to_column_format() of snowpilot/semantic_context/utils/utils.py.
166 | message Fact {
167 |   // A descriptive name for this measure.
168 |   string name = 1 [ (id_field) = true ];
169 |   // A list of other terms/phrases used to refer to this measure.
170 |   repeated string synonyms = 2 [ (optional) = true ];
171 |   // A brief description about this measure, including things like what data
172 |   // it has.
173 |   string description = 3 [ (optional) = true ];
174 |   // The SQL expression defining this measure. Could simply be a physical column
175 |   // name or an arbitrary SQL expression over one or more physical columns of
176 |   // the underlying physical table.
177 |   string expr = 4 [ (sql_expression) = true ];
178 |   // The data type of this measure.
179 |   // TODO(nsehrawat): Consider creating an enum instead, with all snowflake
180 |   // support data types.
181 |   string data_type = 5;
182 |   // If no aggregation is specified, then this is the default aggregation
183 |   // applied to this measure in contxt of a grouping.
184 |   AggregationType default_aggregation = 6 [ (optional) = true , deprecated = true ];
185 |   // Sample values of this measure.
186 |   repeated string sample_values = 7 [ (optional) = true ];
187 | }
188 | 
189 | // Filter represents a named SQL expression that's used for filtering.
190 | // TODO: add validation. we should only support where clause style filter (no
191 | // aggregations) and reject having clauses.
192 | message NamedFilter {
193 |   // A descriptive name for this filter.
194 |   string name = 1;
195 |   // A list of other term/phrases used to refer to this column.
196 |   repeated string synonyms = 2 [ (optional) = true ];
197 |   // A brief description about this column, including details of what this
198 |   // filter is typically used for.
199 |   string description = 3 [ (optional) = true ];
200 |   // The SQL expression of this filter.
201 |   string expr = 4 [ (sql_expression) = true ];
202 | }
203 | 
204 | // FullyQualifiedTable is used to represent three part table names -
205 | // (database, schema, table).
206 | message FullyQualifiedTable {
207 |   string database = 1;
208 |   string schema = 2;
209 |   string table = 3;
210 | }
211 | 
212 | // Defines a primary key of a table. In the general case, primary keys
213 | // are a collection of columns of the table.
214 | // For discussion: PK FK are potentially duplicative to join path in a semantic
215 | // model. However, it implies uniqueness which can be informative for getting
216 | // right aggregation level. For that reason, we are exposing only the PrimaryKey
217 | // currently. Join paths seem more extensible than foreign keys for supporting
218 | // join. Further experimentation is needed to see if JoinPath and ForeignKey can
219 | // yield similar results.
220 | message PrimaryKey {
221 |   // Base column names that constitute the primary key.
222 |   repeated string columns = 1;
223 | }
224 | 
225 | // Defines a foreign key that references the primary key of another table.
226 | message ForeignKey {
227 |   // Base column names of the foreign key table.
228 |   repeated string fkey_columns = 1;
229 |   // The primary key table that this foreign key references.
230 |   FullyQualifiedTable pkey_table = 2;
231 |   // Base column names of the primary key table.
232 |   repeated string pkey_columns = 3;
233 | }
234 | 
235 | // Table is analogous to a database table and provides a simple view over an
236 | // existing database table.  A table can leave out some columns from the base
237 | // table and/or introduce new derived columns.
238 | message Table {
239 |   // A descriptive name for this table.
240 |   string name = 1 [ (id_field) = true ];
241 |   // A list of other term/phrases used to refer to this table.
242 |   repeated string synonyms = 2 [ (optional) = true ];
243 |   // A brief description of this table, including details of what kinds of
244 |   // analysis is it typically used for.
245 |   string description = 3 [ (optional) = true ];
246 |   // Fully qualified name of the underlying base table.
247 |   FullyQualifiedTable base_table = 4;
248 | 
249 |   // We allow two formats for specifying logical columns of a table:
250 |   // 1. As a list of columns.
251 |   // 2. As three separate list of dimensions, time dimensions, and measures.
252 |   // For the external facing yaml specification, we have chosen to go with (2).
253 |   // However, for the time being we'll support both (1) and (2) and continue
254 |   // using (1) as the internal representation.
255 |   repeated Column columns = 5 [ (optional) = true ];
256 |   repeated Dimension dimensions = 9 [ (optional) = true ];
257 |   repeated TimeDimension time_dimensions = 10 [ (optional) = true ];
258 |   repeated Fact measures = 11 [ (optional) = true, deprecated = true ];
259 |   repeated Fact facts = 12 [ (optional) = true ];
260 |   repeated Metric metrics = 13 [ (optional) = true ];
261 | 
262 |   // Primary key of the table, if any.
263 |   PrimaryKey primary_key = 6 [ (optional) = true ];
264 |   // Foreign keys of the table, if any.
265 |   repeated ForeignKey foreign_keys = 7 [ (optional) = true ];
266 |   // Predefined filters on this table, if any.
267 |   repeated NamedFilter filters = 8 [ (optional) = true ];
268 |   // NEXT_TAG: 14.
269 | }
270 | 
271 | // Metric are named computation over a collection of columns.  For now, we
272 | // only allow a metric to be defined over columns from a single table. In
273 | // future, we'll expand to allowing metrics that refer to columns from multiple
274 | // tables.
275 | message Metric {
276 |   // A descriptive name of the metric.
277 |   string name = 1 [ (id_field) = true ];
278 |   // A list of other term/phrases used to refer to this metric.
279 |   repeated string synonyms = 2 [ (optional) = true ];
280 |   // A brief description of this metric, including details of what it computes.
281 |   string description = 3 [ (optional) = true ];
282 |   // The SQL expression to compute this metric.
283 |   // All columns used must be fully qualified with the logical table name.
284 |   // Expression must be an aggregate
285 |   string expr = 4 [ (sql_expression) = true ];
286 |   // The filter associated with this metric.
287 |   // Do not expose this for now.
288 |   MetricsFilter filter = 5 [ (optional) = true ];
289 | }
290 | 
291 | message MetricsFilter { string expr = 1 [ (sql_expression) = true ]; }
292 | 
293 | // Type of the join - inner, left outer, etc.
294 | enum JoinType {
295 |   join_type_unknown = 0;
296 |   inner = 1;
297 |   left_outer = 2;
298 |   full_outer = 3 [ deprecated = true ];
299 |   cross = 4 [ deprecated = true ];
300 |   right_outer = 5 [ deprecated = true ];
301 | }
302 | 
303 | // Type of the relationship - one-to-one, many-to-one, etc.
304 | enum RelationshipType {
305 |   relationship_type_unknown = 0;
306 |   one_to_one = 1;
307 |   many_to_one = 2;
308 |   one_to_many = 3 [ deprecated = true ];
309 |   many_to_many = 4 [ deprecated = true ];
310 | }
311 | 
312 | message RelationKey {
313 |   // Only support equi-join relationship for now.
314 |   string left_column = 1;
315 |   string right_column = 2;
316 | }
317 | 
318 | // Relationship represents a join between two tables.
319 | message Relationship {
320 |   // A unique name of the join.
321 |   string name = 1;
322 |   // The left hand side table of the join.
323 |   string left_table = 2;
324 |   // The right hand side table of the join.
325 |   string right_table = 3;
326 |   // The expression used to join left and right tables. Only used internally.
327 |   string expr = 4 [ (sql_expression) = true, (optional) = true ];
328 |   // Keys directly represent the join relationship.
329 |   repeated RelationKey relationship_columns = 7 [ (optional) = true ];
330 |   // Type of the join.
331 |   JoinType join_type = 5;
332 |   // Type of the relationship.
333 |   RelationshipType relationship_type = 6;
334 | }
335 | 
336 | // A message that encapsulates custom instructions for each module.
337 | message ModuleCustomInstructions {
338 |   // Custom instructions for SQL Generation.
339 |   string sql_generation = 1 [ (optional) = true ];
340 |   // Custom instructions for Question Categorization.
341 |   string question_categorization = 2 [ (optional) = true ];
342 | }
343 | 
344 | // The semantic context relevant to generating SQL for answering a data
345 | // question.
346 | message SemanticModel {
347 |   // A descriptive name of the project.
348 |   string name = 1;
349 |   // A brief description of this project, including details of what kind of
350 |   // analysis does this project enable.
351 |   string description = 2 [ (optional) = true ];
352 |   // List of tables in this project.
353 |   repeated Table tables = 3;
354 |   // List of relationships in this project.
355 |   repeated Relationship relationships = 5 [ (optional) = true ];
356 |   // List of verified queries for this semantic model.
357 |   repeated VerifiedQuery verified_queries = 6 [ (optional) = true ];
358 |   // Custom instructions that will be applied to the final SQL generation.
359 |   string custom_instructions = 7 [ (optional) = true ];
360 |   // Module-specific custom instructions. The SQL generation instruction here
361 |   // will take precedence over the legacy custom_instructions if it exists.
362 |   ModuleCustomInstructions module_custom_instructions = 8 [ (optional) = true ];
363 | }
364 | 
365 | // VerifiedQuery represents a (question, sql) pair that has been manually
366 | // verified (e.g. by an analyst) to be correct.
367 | message VerifiedQuery {
368 |   // A name for this verified query. Mainly used for display purposes.
369 |   string name = 1;
370 |   // The name of the semantic model on which this verified query is based off.
371 |   string semantic_model_name = 2 [ (optional) = true ];
372 |   // The question being answered.
373 |   string question = 3;
374 |   // The correct SQL query for answering the question.
375 |   string sql = 4 [ (sql_expression) = true ];
376 |   // Timestamp at which the query was last verified - measures in seconds since
377 |   // epoch, in UTC.
378 |   int64 verified_at = 5 [ (optional) = true ];
379 |   // Name of the person who verified this query.
380 |   string verified_by = 6 [ (optional) = true ];
381 |   // Whether to always include in this question in the suggested questions
382 |   // module
383 |   bool use_as_onboarding_question = 7 [ (optional) = true ];
384 | }
385 | 
386 | // VerifiedQueryRepository is a simply a collection of verified queries.
387 | message VerifiedQueryRepository { repeated VerifiedQuery verified_queries = 1; }
388 | 


--------------------------------------------------------------------------------
/semantic_model_generator/snowflake_utils/env_vars.py:
--------------------------------------------------------------------------------
 1 | import os
 2 | 
 3 | from dotenv import load_dotenv
 4 | 
 5 | load_dotenv(override=True)
 6 | DEFAULT_SESSION_TIMEOUT_SEC = int(os.environ.get("SNOWFLAKE_SESSION_TIMEOUT_SEC", 120))
 7 | SNOWFLAKE_ROLE = os.getenv("SNOWFLAKE_ROLE")
 8 | SNOWFLAKE_WAREHOUSE = os.getenv("SNOWFLAKE_WAREHOUSE")
 9 | SNOWFLAKE_USER = os.getenv("SNOWFLAKE_USER")
10 | SNOWFLAKE_PASSWORD = os.getenv("SNOWFLAKE_PASSWORD")
11 | SNOWFLAKE_HOST = os.getenv("SNOWFLAKE_HOST")
12 | SNOWFLAKE_AUTHENTICATOR = os.getenv("SNOWFLAKE_AUTHENTICATOR")
13 | SNOWFLAKE_ACCOUNT_LOCATOR = os.getenv("SNOWFLAKE_ACCOUNT_LOCATOR")
14 | 
15 | # Optional MFA environment variables
16 | SNOWFLAKE_MFA_PASSCODE = os.getenv("SNOWFLAKE_MFA_PASSCODE")
17 | SNOWFLAKE_MFA_PASSCODE_IN_PASSWORD = os.getenv("SNOWFLAKE_MFA_PASSCODE_IN_PASSWORD")
18 | 
19 | 
20 | def assert_required_env_vars() -> list[str]:
21 |     """
22 |     Ensures that the required environment variables are set before proceeding.
23 |     Returns: list of missing required environment variables
24 | 
25 |     """
26 | 
27 |     missing_env_vars = []
28 |     if not SNOWFLAKE_ROLE:
29 |         missing_env_vars.append("SNOWFLAKE_ROLE")
30 |     if not SNOWFLAKE_WAREHOUSE:
31 |         missing_env_vars.append("SNOWFLAKE_WAREHOUSE")
32 |     if not SNOWFLAKE_USER:
33 |         missing_env_vars.append("SNOWFLAKE_USER")
34 |     if not SNOWFLAKE_ACCOUNT_LOCATOR:
35 |         missing_env_vars.append("SNOWFLAKE_ACCOUNT_LOCATOR")
36 |     if not SNOWFLAKE_HOST:
37 |         missing_env_vars.append("SNOWFLAKE_HOST")
38 |     if not SNOWFLAKE_PASSWORD and not SNOWFLAKE_AUTHENTICATOR:
39 |         missing_env_vars.append("SNOWFLAKE_PASSWORD/SNOWFLAKE_AUTHENTICATOR")
40 | 
41 |     # Assert that SNOWFLAKE_PASSWORD is required unless the user is using the externalbrowser authenticator
42 |     if (
43 |         SNOWFLAKE_AUTHENTICATOR
44 |         and SNOWFLAKE_AUTHENTICATOR.lower() != "externalbrowser"
45 |         and not SNOWFLAKE_PASSWORD
46 |     ):
47 |         missing_env_vars.append("SNOWFLAKE_PASSWORD")
48 | 
49 |     return missing_env_vars
50 | 


--------------------------------------------------------------------------------
/semantic_model_generator/snowflake_utils/utils.py:
--------------------------------------------------------------------------------
 1 | from typing import Dict, Optional, Union
 2 | 
 3 | from snowflake.connector import connect
 4 | from snowflake.connector.connection import SnowflakeConnection
 5 | 
 6 | from semantic_model_generator.data_processing.data_types import FQNParts
 7 | 
 8 | 
 9 | def create_fqn_table(fqn_str: str) -> FQNParts:
10 |     if fqn_str.count(".") != 2:
11 |         raise ValueError(
12 |             "Expected to have a table fully qualified name following the {database}.{schema}.{table} format."
13 |             + f"Instead found {fqn_str}"
14 |         )
15 |     database, schema, table = fqn_str.split(".")
16 |     return FQNParts(
17 |         database=database.upper(), schema_name=schema.upper(), table=table.upper()
18 |     )
19 | 
20 | 
21 | def create_connection_parameters(
22 |     user: str,
23 |     account: str,
24 |     password: Optional[str] = None,
25 |     host: Optional[str] = None,
26 |     role: Optional[str] = None,
27 |     warehouse: Optional[str] = None,
28 |     database: Optional[str] = None,
29 |     schema: Optional[str] = None,
30 |     authenticator: Optional[str] = None,
31 |     passcode: Optional[str] = None,
32 |     passcode_in_password: Optional[bool] = None,
33 | ) -> Dict[str, Union[str, bool]]:
34 |     connection_parameters: Dict[str, Union[str, bool]] = dict(
35 |         user=user, account=account
36 |     )
37 |     if password:
38 |         connection_parameters["password"] = password
39 |     if role:
40 |         connection_parameters["role"] = role
41 |     if warehouse:
42 |         connection_parameters["warehouse"] = warehouse
43 |     if database:
44 |         connection_parameters["database"] = database
45 |     if schema:
46 |         connection_parameters["schema"] = schema
47 |     if authenticator:
48 |         connection_parameters["authenticator"] = authenticator
49 |     if host:
50 |         connection_parameters["host"] = host
51 |     if passcode:
52 |         connection_parameters["passcode"] = passcode
53 |     if passcode_in_password:
54 |         connection_parameters["passcode_in_password"] = passcode_in_password
55 |     return connection_parameters
56 | 
57 | 
58 | def _connection(
59 |     connection_parameters: Dict[str, Union[str, bool]]
60 | ) -> SnowflakeConnection:
61 |     # https://docs.snowflake.com/en/developer-guide/python-connector/python-connector-connect
62 |     return connect(**connection_parameters)
63 | 
64 | 
65 | def snowflake_connection(
66 |     user: str,
67 |     account: str,
68 |     role: str,
69 |     warehouse: str,
70 |     password: Optional[str] = None,
71 |     host: Optional[str] = None,
72 |     authenticator: Optional[str] = None,
73 |     passcode: Optional[str] = None,
74 |     passcode_in_password: Optional[bool] = None,
75 | ) -> SnowflakeConnection:
76 |     """
77 |     Returns a Snowflake Connection to the specified account.
78 |     """
79 |     return _connection(
80 |         create_connection_parameters(
81 |             user=user,
82 |             password=password,
83 |             host=host,
84 |             account=account,
85 |             role=role,
86 |             warehouse=warehouse,
87 |             authenticator=authenticator,
88 |             passcode=passcode,
89 |             passcode_in_password=passcode_in_password,
90 |         )
91 |     )
92 | 


--------------------------------------------------------------------------------
/semantic_model_generator/tests/snowflake_connector_test.py:
--------------------------------------------------------------------------------
  1 | from unittest import mock
  2 | from unittest.mock import MagicMock, call, patch
  3 | 
  4 | import pandas as pd
  5 | import pytest
  6 | from pandas.testing import assert_frame_equal
  7 | 
  8 | from semantic_model_generator.data_processing.data_types import Column, Table
  9 | from semantic_model_generator.snowflake_utils import snowflake_connector
 10 | 
 11 | 
 12 | @pytest.fixture
 13 | def mock_snowflake_connection_env(monkeypatch):
 14 |     # Mock environment variable
 15 |     monkeypatch.setenv("SNOWFLAKE_HOST", "test_host")
 16 | 
 17 |     # Use this fixture to also patch instance methods if needed
 18 |     with patch.object(
 19 |         snowflake_connector.SnowflakeConnector, "_get_user", return_value="test_user"
 20 |     ), patch.object(
 21 |         snowflake_connector.SnowflakeConnector,
 22 |         "_get_password",
 23 |         return_value="test_password",
 24 |     ), patch.object(
 25 |         snowflake_connector.SnowflakeConnector, "_get_role", return_value="test_role"
 26 |     ), patch.object(
 27 |         snowflake_connector.SnowflakeConnector,
 28 |         "_get_warehouse",
 29 |         return_value="test_warehouse",
 30 |     ), patch.object(
 31 |         snowflake_connector.SnowflakeConnector, "_get_host", return_value="test_host"
 32 |     ):
 33 |         yield
 34 | 
 35 | 
 36 | @pytest.fixture
 37 | def schemas_tables_columns() -> pd.DataFrame:
 38 |     return pd.DataFrame(
 39 |         columns=[
 40 |             "TABLE_SCHEMA",
 41 |             "TABLE_NAME",
 42 |             "COLUMN_NAME",
 43 |             "DATA_TYPE",
 44 |             "COLUMN_COMMENT",
 45 |         ],
 46 |         data=[
 47 |             ["TEST_SCHEMA_1", "table_1", "col_1", "VARCHAR", None],
 48 |             ["TEST_SCHEMA_1", "table_1", "col_2", "NUMBER", None],
 49 |             ["TEST_SCHEMA_1", "table_2", "col_1", "NUMBER", "table_2_col_1_comment"],
 50 |             [
 51 |                 "TEST_SCHEMA_1",
 52 |                 "table_2",
 53 |                 "col_2",
 54 |                 "TIMESTAMP_NTZ",
 55 |                 "table_2_col_2_comment",
 56 |             ],
 57 |             ["TEST_SCHEMA_2", "table_3", "col_1", "VARIANT", None],
 58 |             [
 59 |                 "TEST_SCHEMA_2",
 60 |                 "invalid_table",
 61 |                 "col_1",
 62 |                 "VARIANT",
 63 |                 "invalid_table_col_1_comment",
 64 |             ],
 65 |         ],
 66 |     )
 67 | 
 68 | 
 69 | @pytest.fixture
 70 | def valid_tables() -> pd.DataFrame:
 71 |     return pd.DataFrame(
 72 |         columns=["TABLE_SCHEMA", "TABLE_NAME", "TABLE_COMMENT"],
 73 |         data=[
 74 |             ["TEST_SCHEMA_1", "table_1", None],
 75 |             ["TEST_SCHEMA_1", "table_2", "table_2_comment"],
 76 |             ["TEST_SCHEMA_2", "table_3", "table_3_comment"],
 77 |         ],
 78 |     )
 79 | 
 80 | 
 81 | _TEST_TABLE_ONE = Table(
 82 |     id_=0,
 83 |     name="table_1",
 84 |     columns=[
 85 |         Column(
 86 |             id_=0,
 87 |             column_name="col_1",
 88 |             column_type="text",
 89 |             is_primary_key=True,
 90 |             is_foreign_key=False,
 91 |         ),
 92 |         Column(
 93 |             id_=1,
 94 |             column_name="col_2",
 95 |             column_type="number",
 96 |             is_primary_key=False,
 97 |             is_foreign_key=False,
 98 |         ),
 99 |     ],
100 | )
101 | 
102 | 
103 | @mock.patch(
104 |     "semantic_model_generator.snowflake_utils.snowflake_connector.snowflake_connection"
105 | )
106 | def test_connect(
107 |     mock_snowflake_connection: mock.MagicMock, mock_snowflake_connection_env
108 | ):
109 |     mock_snowflake_connection.return_value = mock.MagicMock()
110 | 
111 |     connector = snowflake_connector.SnowflakeConnector(account_name="test_account")
112 |     with connector.connect(db_name="test") as conn:
113 |         pass
114 | 
115 |     conn.cursor().execute.assert_has_calls(
116 |         [
117 |             call("ALTER SESSION SET QUERY_TAG = 'SEMANTIC_MODEL_GENERATOR'"),
118 |             call("ALTER SESSION SET STATEMENT_TIMEOUT_IN_SECONDS = 120"),
119 |         ]
120 |     )
121 |     conn.close.assert_called_with()
122 | 
123 | 
124 | @mock.patch(
125 |     "semantic_model_generator.snowflake_utils.snowflake_connector.snowflake_connection"
126 | )
127 | def test_connect_with_schema(
128 |     mock_snowflake_connection: mock.MagicMock, mock_snowflake_connection_env
129 | ):
130 |     mock_snowflake_connection.return_value = mock.MagicMock()
131 | 
132 |     connector = snowflake_connector.SnowflakeConnector(
133 |         account_name="test_account",
134 |     )
135 |     with connector.connect(db_name="test_db", schema_name="test_schema") as conn:
136 |         pass
137 | 
138 |     conn.cursor().execute.assert_has_calls(
139 |         [
140 |             call("ALTER SESSION SET QUERY_TAG = 'SEMANTIC_MODEL_GENERATOR'"),
141 |             call("ALTER SESSION SET STATEMENT_TIMEOUT_IN_SECONDS = 120"),
142 |         ]
143 |     )
144 |     conn.close.assert_called_with()
145 | 
146 | 
147 | @mock.patch(
148 |     "semantic_model_generator.snowflake_utils.snowflake_connector._fetch_valid_tables_and_views"
149 | )
150 | @mock.patch(
151 |     "semantic_model_generator.snowflake_utils.snowflake_connector.snowflake_connection"
152 | )
153 | def test_get_valid_schema_table_columns_df(
154 |     mock_snowflake_connection: mock.MagicMock,
155 |     mock_valid_tables: mock.MagicMock,
156 |     valid_tables: pd.DataFrame,
157 |     schemas_tables_columns: pd.DataFrame,
158 | ):
159 |     mock_conn = mock.MagicMock()
160 |     # We expect get_database_representation() to execute queries in this order:
161 |     # - select from information_schema.tables
162 |     # - select from information_schema.columns for each table.
163 |     mock_conn.cursor().execute().fetch_pandas_all.side_effect = [
164 |         schemas_tables_columns[schemas_tables_columns["TABLE_NAME"] == "table_1"]
165 |     ]
166 |     mock_snowflake_connection.return_value = mock_conn
167 |     mock_valid_tables.return_value = valid_tables
168 | 
169 |     got = snowflake_connector.get_valid_schemas_tables_columns_df(
170 |         mock_conn, "TEST_DB", "TEST_SCHEMA_1", ["table_1"]
171 |     )
172 | 
173 |     want_data = {
174 |         "TABLE_SCHEMA": ["TEST_SCHEMA_1", "TEST_SCHEMA_1"],
175 |         "TABLE_NAME": ["table_1", "table_1"],
176 |         "TABLE_COMMENT": [None, None],
177 |         "COLUMN_NAME": ["col_1", "col_2"],
178 |         "DATA_TYPE": ["VARCHAR", "NUMBER"],
179 |         "COLUMN_COMMENT": [None, None],
180 |     }
181 | 
182 |     # Create a DataFrame
183 |     want = pd.DataFrame(want_data)
184 | 
185 |     assert_frame_equal(want, got)
186 | 
187 |     # Assert that the connection executed the expected queries.
188 |     query = "select t.TABLE_SCHEMA, t.TABLE_NAME, c.COLUMN_NAME, c.DATA_TYPE, c.COMMENT as COLUMN_COMMENT\nfrom TEST_DB.information_schema.tables as t\njoin TEST_DB.information_schema.columns as c on t.table_schema = c.table_schema and t.table_name = c.table_name where t.table_schema ilike 'TEST_SCHEMA_1' AND LOWER(t.table_name) in ('table_1') \norder by 1, 2, c.ordinal_position"
189 |     mock_conn.cursor().execute.assert_any_call(query)
190 | 
191 | 
192 | @pytest.fixture
193 | def snowflake_data():
194 |     return [
195 |         # This mimics the return value of cursor.fetchall() for tables and views
196 |         ([("table1", "schema1", "A table comment")], [("column1", "dtype")]),
197 |         ([("view1", "schema1", "A view comment")], [("column1", "dtype")]),
198 |     ]
199 | 
200 | 
201 | @pytest.fixture
202 | def expected_df():
203 |     # Expected DataFrame structure based on mocked fetchall data
204 |     return pd.DataFrame(
205 |         {
206 |             snowflake_connector._TABLE_NAME_COL: ["table1", "view1"],
207 |             snowflake_connector._TABLE_SCHEMA_COL: ["schema1", "schema1"],
208 |             snowflake_connector._TABLE_COMMENT_COL: [
209 |                 "A table comment",
210 |                 "A view comment",
211 |             ],
212 |         }
213 |     )
214 | 
215 | 
216 | def test_fetch_valid_tables_and_views(snowflake_data, expected_df):
217 |     # Mock SnowflakeConnection and cursor
218 |     mock_conn = mock.MagicMock()
219 |     mock_cursor = mock_conn.cursor.return_value
220 |     mock_cursor.execute.return_value = mock_cursor
221 |     # Set side effects for fetchall and description based on snowflake_data fixture
222 |     mock_cursor.fetchall.side_effect = [snowflake_data[0][0], snowflake_data[1][0]]
223 | 
224 |     mock_name_one = MagicMock()
225 |     mock_name_one.name = "name"
226 |     mock_name_two = MagicMock()
227 |     mock_name_two.name = "schema_name"
228 |     mock_name_three = MagicMock()
229 |     mock_name_three.name = "comment"
230 | 
231 |     mocked_descriptions = [mock_name_one, mock_name_two, mock_name_three]
232 |     mock_cursor.description = mocked_descriptions
233 | 
234 |     # Call the function to test
235 |     result_df = snowflake_connector._fetch_valid_tables_and_views(mock_conn, "mock_db")
236 | 
237 |     # Assert the result is as expected
238 |     pd.testing.assert_frame_equal(
239 |         result_df.reset_index(drop=True), expected_df.reset_index(drop=True)
240 |     )
241 | 
242 |     # Verify execute was called with correct queries
243 |     mock_cursor.execute.assert_any_call("show tables in database mock_db")
244 |     mock_cursor.execute.assert_any_call("show views in database mock_db")
245 | 


--------------------------------------------------------------------------------
/semantic_model_generator/tests/utils_test.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | 
 3 | from semantic_model_generator.data_processing.data_types import FQNParts
 4 | from semantic_model_generator.snowflake_utils.utils import create_fqn_table
 5 | 
 6 | 
 7 | def test_fqn_creation():
 8 |     input_name = "database.schema.table"
 9 | 
10 |     fqn_parts = create_fqn_table(input_name)
11 | 
12 |     assert fqn_parts == FQNParts(
13 |         database="DATABASE", schema_name="SCHEMA", table="table"
14 |     )
15 | 
16 | 
17 | def test_fqn_creation_invalid_name():
18 |     input_name = "database.schema table"
19 |     with pytest.raises(ValueError):
20 |         create_fqn_table(input_name)
21 | 


--------------------------------------------------------------------------------
/semantic_model_generator/tests/validate_model_test.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from unittest.mock import MagicMock, patch
 3 | 
 4 | from snowflake.connector import SnowflakeConnection
 5 | 
 6 | from semantic_model_generator.validate_model import validate
 7 | 
 8 | 
 9 | @patch("semantic_model_generator.validate_model.send_message")
10 | def test_validate_success(mock_send_message):
11 |     # Mock the response from send_message to simulate a successful response
12 |     mock_send_message.return_value = {}
13 | 
14 |     # Call the validate function
15 |     conn = MagicMock(spec=SnowflakeConnection)
16 |     yaml_str = "valid_yaml_content"
17 |     result = validate(yaml_str, conn)
18 | 
19 |     assert result is None
20 | 
21 | 
22 | @patch("semantic_model_generator.validate_model.send_message")
23 | def test_validate_error(mock_send_message):
24 |     # Mock the response from send_message to simulate an error response
25 |     mock_send_message.return_value = {
26 |         "error": json.dumps(
27 |             {
28 |                 "message": "This YAML is missing a name. Please use https://github.com/Snowflake-Labs/semantic-model-generator.*"
29 |             }
30 |         )
31 |     }
32 | 
33 |     # Call the validate function and assert that it raises a ValueError
34 |     conn = MagicMock(spec=SnowflakeConnection)
35 |     yaml_str = "invalid_yaml_content"
36 |     try:
37 |         validate(yaml_str, conn)
38 |     except ValueError as e:
39 |         # Verify that the error message is as expected
40 |         assert str(e) == "This YAML is missing a name."
41 | 


--------------------------------------------------------------------------------
/semantic_model_generator/tests/yaml_to_semantic_model_test.py:
--------------------------------------------------------------------------------
 1 | import pytest
 2 | from strictyaml import YAMLValidationError
 3 | 
 4 | from semantic_model_generator.data_processing.proto_utils import yaml_to_semantic_model
 5 | 
 6 | 
 7 | def test_valid_yaml():
 8 |     yaml = """
 9 | name: jaffle_shop
10 | tables:
11 |   - name: orders
12 |     description: Order overview data mart, offering key details for each order including
13 |       if it's a customer's first order and a food vs. drink item breakdown. One row
14 |       per order.
15 |     base_table:
16 |       database: autosql_dataset_dbt_jaffle_shop
17 |       schema: data
18 |       table: orders
19 |     filters:
20 |       - name: large_order
21 |         expr: cogs > 100
22 |       - name: custom_filter
23 |         expr: my_udf(col1, col2)
24 |       - name: window_func
25 |         expr: COUNT(i) OVER (PARTITION BY p ORDER BY o) count_i_Range_Pre
26 | """
27 |     assert yaml_to_semantic_model(yaml) is not None
28 | 
29 | 
30 | def test_invalid_sql():
31 |     yaml = """
32 | name: jaffle_shop
33 | tables:
34 |   - name: orders
35 |     description: Order overview data mart, offering key details for each order including
36 |       if it's a customer's first order and a food vs. drink item breakdown. One row
37 |       per order.
38 |     base_table:
39 |       database: autosql_dataset_dbt_jaffle_shop
40 |       schema: data
41 |       table: orders
42 |     filters:
43 |       - name: large_order
44 |         expr: (cogs > 100
45 | """
46 |     with pytest.raises(YAMLValidationError, match=r".*invalid SQL expression.*"):
47 |         yaml_to_semantic_model(yaml)
48 | 
49 | 
50 | def test_required_field_missing():
51 |     yaml = """
52 | name: jaffle_shop
53 | tables:
54 |   - name: orders
55 |     description: Order overview data mart, offering key details for each order including
56 |       if it's a customer's first order and a food vs. drink item breakdown. One row
57 |       per order.
58 |     base_table:
59 |       database: autosql_dataset_dbt_jaffle_shop
60 |       schema: data
61 | """
62 |     with pytest.raises(
63 |         YAMLValidationError, match=r".*required key.*table.*not found.*"
64 |     ):
65 |         yaml_to_semantic_model(yaml)
66 | 
67 | 
68 | def test_non_string_sample_value():
69 |     yaml = """
70 | name: jaffle_shop
71 | tables:
72 |   - name: orders
73 |     description: Order overview data mart, offering key details for each order including
74 |       if it's a customer's first order and a food vs. drink item breakdown. One row
75 |       per order.
76 |     base_table:
77 |       database: autosql_dataset_dbt_jaffle_shop
78 |       schema: data
79 |       table: orders
80 |     columns:
81 |       - name: order_id
82 |         expr: order_id
83 |         data_type: TEXT
84 |         kind: dimension
85 |         unique: true
86 |         sample_values:
87 |           - yes
88 |           - 1
89 |           - 05-17-2024
90 | """
91 |     ctx = yaml_to_semantic_model(yaml)
92 |     for sample_value in ctx.tables[0].columns[0].sample_values:
93 |         assert isinstance(sample_value, str)
94 | 


--------------------------------------------------------------------------------
/semantic_model_generator/validate/context_length.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | from typing import Any, TypeVar
  3 | 
  4 | from google.protobuf.message import Message
  5 | from loguru import logger
  6 | 
  7 | from semantic_model_generator.data_processing.proto_utils import proto_to_yaml
  8 | from semantic_model_generator.protos import semantic_model_pb2
  9 | 
 10 | # Max number of sample values we include in the semantic model representation.
 11 | _MAX_SAMPLE_VALUES = 3
 12 | 
 13 | ProtoMsg = TypeVar("ProtoMsg", bound=Message)
 14 | 
 15 | # Max total tokens is 32800.
 16 | # We reserve 500 tokens for response (average response is 300 tokens).
 17 | # So the prompt token limit is 32300.
 18 | # We reserve 1220 tokens for model instructions, separate from the semantic model.
 19 | # Thus, the semantic model will get about 31080 tokens,
 20 | # with some more discounting for retrieved literals.
 21 | _TOTAL_PROMPT_TOKEN_LIMIT = 32300
 22 | _BASE_INSTRUCTION_TOKEN_LENGTH = 1220
 23 | #  Estimated 10 tokens per literals since each literal is presented as a filter expression
 24 | #  (i.e. table.column = 'literal').
 25 | #  Currently 10 literals are retrieved per search.
 26 | _TOKENS_PER_LITERAL = 10
 27 | _NUM_LITERAL_RETRIEVALS = 10
 28 | 
 29 | # As per https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
 30 | _CHARS_PER_TOKEN = 4
 31 | 
 32 | 
 33 | def _get_field(msg: ProtoMsg, field_name: str) -> Any:
 34 |     fields = [value for fd, value in msg.ListFields() if fd.name == field_name]
 35 |     if not fields:
 36 |         return None
 37 |     return fields[0]
 38 | 
 39 | 
 40 | def _count_search_services(model: ProtoMsg) -> int:
 41 |     cnt = 0
 42 |     tables = _get_field(model, "tables")
 43 |     if not tables:
 44 |         return 0
 45 | 
 46 |     for table in tables:
 47 |         dimensions = _get_field(table, "dimensions")
 48 |         if not dimensions:
 49 |             continue
 50 |         for dimension in dimensions:
 51 |             if _get_field(dimension, "cortex_search_service_name"):
 52 |                 cnt += 1
 53 |     return cnt
 54 | 
 55 | 
 56 | def validate_context_length(
 57 |     model_orig: semantic_model_pb2.SemanticModel, throw_error: bool = False
 58 | ) -> None:
 59 |     """
 60 |     Validate the token limit for the model with space for the prompt.
 61 | 
 62 |     yaml_model: The yaml semantic model
 63 |     throw_error: Should this function throw an error or just a warning.
 64 |     """
 65 |     # When counting tokens, we need to remove the verified_queries field and additional sample values. Make a copy for counting.
 66 |     model = copy.deepcopy(model_orig)
 67 |     model.ClearField("verified_queries")
 68 |     # Also clear all the dimensional sample values, as we'll retrieve those into filters by default.
 69 |     for t in model.tables:
 70 |         for dim in t.dimensions:
 71 |             del dim.sample_values[_MAX_SAMPLE_VALUES:]
 72 | 
 73 |     num_search_services = _count_search_services(model)
 74 | 
 75 |     yaml_str = proto_to_yaml(model)
 76 |     # Pass in the str version of the semantic context yaml.
 77 |     # This isn't exactly how many tokens the model will be, but should roughly be correct.
 78 |     literals_buffer = (
 79 |         _TOKENS_PER_LITERAL * _NUM_LITERAL_RETRIEVALS * (1 + num_search_services)
 80 |     )
 81 |     approx_instruction_length = _BASE_INSTRUCTION_TOKEN_LENGTH + literals_buffer
 82 |     model_tokens_limit = _TOTAL_PROMPT_TOKEN_LIMIT - approx_instruction_length
 83 |     model_tokens = len(yaml_str) // _CHARS_PER_TOKEN
 84 |     if model_tokens > model_tokens_limit:
 85 |         tokens_to_remove = model_tokens - model_tokens_limit
 86 |         chars_to_remove = tokens_to_remove * _CHARS_PER_TOKEN
 87 |         if throw_error:
 88 |             raise ValueError(
 89 |                 f"Your semantic model is too large. "
 90 |                 f"Passed size is {len(yaml_str)} characters. "
 91 |                 f"We need you to remove {chars_to_remove} characters in your semantic model. "
 92 |                 f"Please check: \n"
 93 |                 f" (1) If you have long descriptions that can be truncated. \n"
 94 |                 f" (2) If you can remove some columns that are not used within your tables. \n"
 95 |                 f" (3) If you have extra tables you do not need."
 96 |             )
 97 |         else:
 98 |             logger.warning(
 99 |                 f"WARNING 🚨: The Semantic model is too large. \n"
100 |                 f"Passed size is {len(yaml_str)} characters. "
101 |                 f"We need you to remove {chars_to_remove} characters in your semantic model. "
102 |                 f"Please check: \n"
103 |                 f" (1) If you have long descriptions that can be truncated. \n"
104 |                 f" (2) If you can remove some columns that are not used within your tables. \n"
105 |                 f" (3) If you have extra tables you do not need. \n"
106 |                 f" Once you've finished updating, please validate your semantic model."
107 |             )
108 | 


--------------------------------------------------------------------------------
/semantic_model_generator/validate/keywords.py:
--------------------------------------------------------------------------------
 1 | # https://docs.snowflake.com/en/sql-reference/reserved-keywords
 2 | SF_RESERVED_WORDS = {
 3 |     "ACCOUNT",
 4 |     "ALL",
 5 |     "ALTER",
 6 |     "AND",
 7 |     "ANY",
 8 |     "AS",
 9 |     "BETWEEN",
10 |     "BY",
11 |     "CASE",
12 |     "CAST",
13 |     "CHECK",
14 |     "COLUMN",
15 |     "CONNECT",
16 |     "CONNECTION",
17 |     "CONSTRAINT",
18 |     "CREATE",
19 |     "CROSS",
20 |     "CURRENT",
21 |     "CURRENT_DATE",
22 |     "CURRENT_TIME",
23 |     "CURRENT_TIMESTAMP",
24 |     "CURRENT_USER",
25 |     "DATABASE",
26 |     "DELETE",
27 |     "DISTINCT",
28 |     "DROP",
29 |     "ELSE",
30 |     "EXISTS",
31 |     "FALSE",
32 |     "FOLLOWING",
33 |     "FOR",
34 |     "FROM",
35 |     "FULL",
36 |     "GRANT",
37 |     "GROUP",
38 |     "GSCLUSTER",
39 |     "HAVING",
40 |     "ILIKE",
41 |     "IN",
42 |     "INCREMENT",
43 |     "INNER",
44 |     "INSERT",
45 |     "INTERSECT",
46 |     "INTO",
47 |     "IS",
48 |     "ISSUE",
49 |     "JOIN",
50 |     "LATERAL",
51 |     "LEFT",
52 |     "LIKE",
53 |     "LOCALTIME",
54 |     "LOCALTIMESTAMP",
55 |     "MINUS",
56 |     "NATURAL",
57 |     "NOT",
58 |     "NULL",
59 |     "OF",
60 |     "ON",
61 |     "OR",
62 |     "ORDER",
63 |     "ORGANIZATION",
64 |     "QUALIFY",
65 |     "REGEXP",
66 |     "REVOKE",
67 |     "RIGHT",
68 |     "RLIKE",
69 |     "ROW",
70 |     "ROWS",
71 |     "SAMPLE",
72 |     "SCHEMA",
73 |     "SELECT",
74 |     "SET",
75 |     "SOME",
76 |     "START",
77 |     "TABLE",
78 |     "TABLESAMPLE",
79 |     "THEN",
80 |     "TO",
81 |     "TRIGGER",
82 |     "TRUE",
83 |     "TRY_CAST",
84 |     "UNION",
85 |     "UNIQUE",
86 |     "UPDATE",
87 |     "USING",
88 |     "VALUES",
89 |     "VIEW",
90 |     "WHEN",
91 |     "WHENEVER",
92 |     "WHERE",
93 |     "WITH",
94 | }
95 | 


--------------------------------------------------------------------------------
/semantic_model_generator/validate/schema.py:
--------------------------------------------------------------------------------
  1 | # This file is essentially doing DFS in the protobuf Descriptors and storing in the SCHEMA. We start with as the root
  2 | # SemanticModel at the bottom of this file. This will automatically pickup any changes to the protobuf (given you run
  3 | # the protoc command before to regenerate the python files. Different proto messages can have the same message type as a
  4 | # child, so we keep a dict of precomputed types to avoid double computing. This currently does not support cycles in the
  5 | # proto definition, but we can add a visited set to this if we ever need to.
  6 | 
  7 | 
  8 | from typing import Dict
  9 | 
 10 | import sqlglot
 11 | from google.protobuf.descriptor import Descriptor, EnumDescriptor, FieldDescriptor
 12 | from strictyaml import (
 13 |     Bool,
 14 |     Decimal,
 15 |     Enum,
 16 |     Int,
 17 |     Map,
 18 |     Optional,
 19 |     Seq,
 20 |     Str,
 21 |     Validator,
 22 |     YAMLValidationError,
 23 | )
 24 | 
 25 | from semantic_model_generator.protos import semantic_model_pb2
 26 | from semantic_model_generator.validate.keywords import SF_RESERVED_WORDS
 27 | 
 28 | scalar_type_map = {
 29 |     FieldDescriptor.TYPE_BOOL: Bool,
 30 |     FieldDescriptor.TYPE_STRING: Str,
 31 |     FieldDescriptor.TYPE_DOUBLE: Decimal,
 32 |     FieldDescriptor.TYPE_FLOAT: Decimal,
 33 |     FieldDescriptor.TYPE_INT32: Int,
 34 |     FieldDescriptor.TYPE_INT64: Int,
 35 | }
 36 | 
 37 | 
 38 | class SqlExpression(Str):  # type: ignore
 39 |     def validate_scalar(self, chunk):  # type: ignore
 40 |         try:
 41 |             sqlglot.parse_one(chunk.contents, dialect=sqlglot.dialects.Snowflake)  # type: ignore
 42 |         except Exception:
 43 |             chunk.expecting_but_found("", "invalid SQL expression")
 44 |         return chunk.contents
 45 | 
 46 | 
 47 | class IdField(Str):  # type: ignore
 48 |     def validate_scalar(self, chunk):  # type: ignore
 49 |         if not chunk.contents.replace("_", "").replace("$", "").isalnum():
 50 |             chunk.expecting_but_found(
 51 |                 "",
 52 |                 "name can only contain letters, underscores, decimal digits (0-9), and dollar signs ($).",
 53 |             )
 54 |         if chunk.contents.upper() in SF_RESERVED_WORDS:
 55 |             chunk.expecting_but_found("", "name cannot be a Snowflake reserved keyword")
 56 |         return chunk.contents
 57 | 
 58 | 
 59 | class VerifiedQueries(Seq):  # type: ignore
 60 |     """
 61 |     Validator for the verified_queries field.
 62 |     We ensure that there are no duplicate verified queries, by checking for duplicate (question, sql) pairs.
 63 |     """
 64 | 
 65 |     def validate(self, chunk):  # type: ignore
 66 |         super().validate(chunk)
 67 |         seen_queries = set()
 68 |         for query in chunk.contents:
 69 |             qa_pair = (query["question"], query["sql"])
 70 |             if qa_pair in seen_queries:
 71 |                 raise YAMLValidationError(
 72 |                     context="Duplicate verified query found.",
 73 |                     problem=query["name"],
 74 |                     chunk=chunk,
 75 |                 )
 76 |             seen_queries.add(qa_pair)
 77 | 
 78 | 
 79 | def create_schema_for_message(
 80 |     message: Descriptor, precomputed_types: Dict[str, Validator]
 81 | ) -> Validator:
 82 |     if message.name in precomputed_types:
 83 |         return precomputed_types[message.name]
 84 |     message_schema = {}
 85 |     for k, v in message.fields_by_name.items():
 86 |         if _is_optional_field(v):
 87 |             message_schema[Optional(k)] = create_schema_for_field(v, precomputed_types)
 88 |         else:
 89 |             message_schema[k] = create_schema_for_field(v, precomputed_types)
 90 |     schema = Map(message_schema)
 91 |     precomputed_types[message.name] = schema
 92 |     return schema
 93 | 
 94 | 
 95 | def create_schema_for_field(
 96 |     field_descriptor: FieldDescriptor, precomputed_types: Dict[str, Validator]
 97 | ) -> Validator:
 98 |     if field_descriptor.type == FieldDescriptor.TYPE_MESSAGE:
 99 |         field_type = create_schema_for_message(
100 |             field_descriptor.message_type, precomputed_types
101 |         )
102 |     elif field_descriptor.type == FieldDescriptor.TYPE_ENUM:
103 |         field_type = create_schema_for_enum(
104 |             field_descriptor.enum_type, precomputed_types
105 |         )
106 |     elif field_descriptor.type == FieldDescriptor.TYPE_STRING and _is_sql_expression(
107 |         field_descriptor
108 |     ):
109 |         field_type = SqlExpression()
110 |     elif field_descriptor.type == FieldDescriptor.TYPE_STRING and _is_id_field(
111 |         field_descriptor
112 |     ):
113 |         field_type = IdField()
114 |     elif field_descriptor.type in scalar_type_map:
115 |         field_type = scalar_type_map[field_descriptor.type]()
116 |     else:
117 |         raise Exception(f"unsupported type: {field_descriptor.type}")
118 | 
119 |     if field_descriptor.label == FieldDescriptor.LABEL_REPEATED:
120 |         if field_descriptor.name == "verified_queries":
121 |             field_type = VerifiedQueries(field_type)
122 |         else:
123 |             field_type = Seq(field_type)
124 | 
125 |     return field_type
126 | 
127 | 
128 | def _is_optional_field(field_descriptor: FieldDescriptor) -> bool:
129 |     return _has_field_option(field_descriptor, "optional")
130 | 
131 | 
132 | def _is_sql_expression(field_descriptor: FieldDescriptor) -> bool:
133 |     return _has_field_option(field_descriptor, "sql_expression")
134 | 
135 | 
136 | def _is_id_field(field_descriptor: FieldDescriptor) -> bool:
137 |     return _has_field_option(field_descriptor, "id_field")
138 | 
139 | 
140 | def _has_field_option(field_descriptor: FieldDescriptor, option_name: str) -> bool:
141 |     option = list(
142 |         filter(
143 |             lambda o: o[0].name == option_name,
144 |             field_descriptor.GetOptions().ListFields(),
145 |         )
146 |     )
147 |     # ListFields returns a list of (FieldDescriptor, value) tuples. This checks that the given option is present
148 |     #  and that its value is True
149 |     return len(option) > 0 and option[0][1]
150 | 
151 | 
152 | def create_schema_for_enum(
153 |     enum: EnumDescriptor, precomputed_types: Dict[str, Validator]
154 | ) -> Validator:
155 |     if enum.name in precomputed_types:
156 |         return precomputed_types[enum.name]
157 |     schema = Enum([v.name for v in enum.values])
158 |     precomputed_types[enum.name] = schema
159 |     return schema
160 | 
161 | 
162 | SCHEMA = create_schema_for_message(semantic_model_pb2.SemanticModel.DESCRIPTOR, {})
163 | 


--------------------------------------------------------------------------------
/semantic_model_generator/validate_model.py:
--------------------------------------------------------------------------------
 1 | from snowflake.connector import SnowflakeConnection
 2 | 
 3 | from app_utils.chat import send_message
 4 | 
 5 | 
 6 | def load_yaml(yaml_path: str) -> str:
 7 |     """
 8 |     Load local yaml file into str.
 9 | 
10 |     yaml_path: str The absolute path to the location of your yaml file. Something like path/to/your/file.yaml.
11 |     """
12 |     with open(yaml_path) as f:
13 |         yaml_str = f.read()
14 |     return yaml_str
15 | 
16 | 
17 | def validate(yaml_str: str, conn: SnowflakeConnection) -> None:
18 |     """
19 |     We perform pseudo-validation by issuing a request to Cortex Analyst with the YAML string as-is, and determining
20 |     whether the request is successful. We don't currently have an explicit validation endpoint available, but validation
21 |     is run at inference time, so this is a reasonable proxy.
22 | 
23 |     This is done in order to remove the need to sync validation logic locally between these codepaths and Analyst.
24 | 
25 |     yaml_str: yaml content in string format.
26 |     conn: SnowflakeConnection Snowflake connection to pass in
27 |     """
28 | 
29 |     dummy_request = [
30 |         {"role": "user", "content": [{"type": "text", "text": "SMG app validation"}]}
31 |     ]
32 |     send_message(conn, yaml_str, dummy_request)
33 | 
34 | 
35 | def validate_from_local_path(yaml_path: str, conn: SnowflakeConnection) -> None:
36 |     yaml_str = load_yaml(yaml_path)
37 |     validate(yaml_str, conn)
38 | 


--------------------------------------------------------------------------------
/sis_setup/app_setup.sql:
--------------------------------------------------------------------------------
  1 | SET (streamlit_warehouse)=(SELECT CURRENT_WAREHOUSE());
  2 | 
  3 | CREATE DATABASE IF NOT EXISTS CORTEX_ANALYST_SEMANTICS
  4 | COMMENT = '{"origin": "sf_sit",
  5 |             "name": "skimantics",
  6 |             "version": {"major": 2, "minor": 0},
  7 |             "attributes": {"deployment": "sis"}}';
  8 | 
  9 | CREATE SCHEMA IF NOT EXISTS CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR
 10 | COMMENT = '{"origin": "sf_sit",
 11 |             "name": "skimantics",
 12 |             "version": {"major": 2, "minor": 0},
 13 |             "attributes": {"deployment": "sis"}}';
 14 | 
 15 | -- Create stage for App logic and 3rd party packages
 16 | CREATE OR REPLACE STAGE CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE
 17 | DIRECTORY = (ENABLE = true)
 18 | COMMENT = '{"origin": "sf_sit",
 19 |             "name": "skimantics",
 20 |             "version": {"major": 2, "minor": 0},
 21 |             "attributes": {"deployment": "sis"}}';
 22 | 
 23 | -- Upload 3rd party packages
 24 | -- Run from sis_setup/ as paths are relative to this directory
 25 | PUT file://app_utils/*.zip @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE OVERWRITE = TRUE AUTO_COMPRESS = FALSE;
 26 | 
 27 | -- Upload App logic
 28 | PUT file://app.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE;
 29 | PUT file://environment.yml @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE;
 30 | PUT file://semantic_model_generator/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE;
 31 | PUT file://semantic_model_generator/data_processing/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/data_processing/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE;
 32 | PUT file://semantic_model_generator/protos/*.p* @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/protos/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE;
 33 | PUT file://semantic_model_generator/snowflake_utils/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/snowflake_utils/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE;
 34 | PUT file://semantic_model_generator/validate/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/validate/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE;
 35 | PUT file://images/*.png @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/images/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE;
 36 | PUT file://journeys/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/journeys/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE;
 37 | PUT file://partner/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/partner/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE;
 38 | PUT file://app_utils/*.py @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/app_utils/ OVERWRITE = TRUE AUTO_COMPRESS = FALSE;
 39 | 
 40 | -- Create Streamlit
 41 | CREATE OR REPLACE STREAMLIT CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.SEMANTIC_MODEL_GENERATOR
 42 | ROOT_LOCATION = '@CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator'
 43 | MAIN_FILE = 'app.py'
 44 | TITLE = "Semantic Model Generator"
 45 | IMPORTS = ('@cortex_analyst_semantics.semantic_model_generator.streamlit_stage/looker_sdk.zip',
 46 | '@cortex_analyst_semantics.semantic_model_generator.streamlit_stage/strictyaml.zip')
 47 | QUERY_WAREHOUSE = $streamlit_warehouse
 48 | COMMENT = '{"origin": "sf_sit",
 49 |             "name": "skimantics",
 50 |             "version": {"major": 2, "minor": 0},
 51 |             "attributes": {"deployment": "sis"}}';
 52 | 
 53 | 
 54 | -- Create Semantic Model Generation Callable
 55 | -- Zip src files for callable SPROC for generation
 56 | CREATE OR REPLACE PROCEDURE CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.ZIP_SRC_FILES(
 57 |     database STRING,
 58 |     schema STRING,
 59 |     stage STRING,
 60 |     source_path STRING,
 61 |     target_parent STRING,
 62 |     zip_filename STRING
 63 | )
 64 | RETURNS VARCHAR
 65 | LANGUAGE PYTHON
 66 | RUNTIME_VERSION = 3.10
 67 | PACKAGES = (
 68 |     'snowflake-snowpark-python==1.18.0'
 69 | )
 70 | HANDLER='zip_staged_files'
 71 | EXECUTE AS CALLER
 72 | AS $$
 73 | from snowflake.snowpark import Session
 74 | from typing import Optional
 75 | 
 76 | def get_staged_files(session: Session,
 77 |                      database: str,
 78 |                      schema: str,
 79 |                      stage: str,
 80 |                      target_parent: Optional[str] = None,
 81 |                      source_path: Optional[str] = None,
 82 |                      ) -> dict[str, str]:
 83 |     
 84 |     query = f"ls @{database}.{schema}.{stage}/{source_path}"
 85 |     file_result = session.sql(query).collect()
 86 | 
 87 |     file_data = {}
 88 |     for row in file_result:
 89 |         filename = row['name'].split('/',1)[1] # Remove the stage name from the filename
 90 | 
 91 |         # If target_parent is provided, replace the original file pathing with it
 92 |         if target_parent:
 93 |             filename = filename.replace(source_path, f"{target_parent}")
 94 | 
 95 |         full_file_path = f"@{database}.{schema}.{row['name']}"
 96 |         file_data[filename] = session.file.get_stream(f"{full_file_path}").read().decode('utf-8')
 97 | 
 98 |     return file_data
 99 | 
100 | def create_zip(file_data: dict[str, str]) -> bytes:
101 |     import io
102 |     import zipfile
103 | 
104 |     zip_buffer = io.BytesIO()
105 | 
106 |     with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED
107 |                          ) as zipf:
108 |         for filename, content in file_data.items():
109 |             zipf.writestr(filename, content)
110 | 
111 |     zip_bytes = zip_buffer.getvalue()
112 | 
113 |     return zip_bytes
114 | 
115 | def upload_zip(session: Session,
116 |                database: str,
117 |                schema: str,
118 |                stage: str,
119 |                zip_file: bytes,
120 |                zip_filename: str,
121 |                ) -> None:
122 |     import io
123 | 
124 |     session.file.put_stream(
125 |                 io.BytesIO(zip_file),
126 |                 f"@{database}.{schema}.{stage}/{zip_filename.replace('zip','')}.zip",
127 |                 auto_compress=False,
128 |                 overwrite=True,
129 |             )
130 |     
131 | def zip_staged_files(session: Session,
132 |                      database: str,
133 |                      schema: str,
134 |                      stage: str,
135 |                      source_path: Optional[str] = None,
136 |                      target_parent: Optional[str] = None,
137 |                      zip_filename: Optional[str] = None,
138 |                      ) -> str:
139 |     
140 |     file_data = get_staged_files(session, database, schema, stage, target_parent, source_path)
141 |     zip_file = create_zip(file_data)
142 | 
143 |     if zip_filename:
144 |         zip_filename = zip_filename
145 |     elif target_parent is not None:
146 |         zip_filename = target_parent
147 |     elif source_path is not None:
148 |         zip_filename = source_path
149 |     else:
150 |         zip_filename = "zipped_files"
151 | 
152 |     upload_zip(session, database, schema, stage, zip_file, zip_filename)
153 | 
154 |     return f"Files zipped and uploaded to {database}.{schema}.{stage}/{zip_filename}.zip."
155 | 
156 | $$;
157 | 
158 | CALL CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.ZIP_SRC_FILES(
159 |     'CORTEX_ANALYST_SEMANTICS',
160 |     'SEMANTIC_MODEL_GENERATOR',
161 |     'streamlit_stage',
162 |     'semantic_model_generator/semantic_model_generator',
163 |     'semantic_model_generator',
164 |     'semantic_model_generator'
165 | );
166 | 
167 | -- Create generation callable
168 | CREATE OR REPLACE PROCEDURE CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.GENERATE_SEMANTIC_FILE(
169 |     STAGE_NAME STRING,
170 |     MODEL_NAME STRING,
171 |     SAMPLE_VALUE INT,
172 |     ALLOW_JOINS BOOLEAN,
173 |     TABLE_LIST ARRAY
174 | )
175 | RETURNS VARCHAR
176 | LANGUAGE PYTHON
177 | RUNTIME_VERSION = 3.10
178 | PACKAGES = (
179 |     'pandas==2.2.2',
180 |     'tqdm==4.66.5',
181 |     'loguru==0.5.3',
182 |     'protobuf==3.20.3',
183 |     'pydantic==2.8.2',
184 |     'pyyaml==6.0.1',
185 |     'ruamel.yaml==0.17.21',
186 |     'pyarrow==14.0.2',
187 |     'sqlglot==25.10.0',
188 |     'numpy==1.26.4',
189 |     'python-dotenv==0.21.0',
190 |     'urllib3==2.2.2',
191 |     'types-pyyaml==6.0.12.12',
192 |     'types-protobuf==4.25.0.20240417',
193 |     'snowflake-snowpark-python==1.18.0',
194 |     'cattrs==23.1.2',
195 |     'filelock'
196 | )
197 | IMPORTS = ('@CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator.zip',
198 |     '@cortex_analyst_semantics.semantic_model_generator.streamlit_stage/strictyaml.zip'
199 |             )
200 | HANDLER='run_generation'
201 | EXECUTE AS CALLER
202 | AS $$
203 | from snowflake.snowpark import Session
204 | 
205 | def import_src_zip(zip_name = 'semantic_model_generator.zip'):
206 |     """Unpacks source zip file in stage to enable importing it to mirror source code structure."""
207 | 
208 |     import os
209 |     import sys
210 |     import zipfile
211 |     from filelock import FileLock
212 | 
213 |     # Get the location of the import directory. Snowflake sets the import
214 |     # directory location so code can retrieve the location via sys._xoptions.
215 |     IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
216 |     import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]
217 | 
218 |     # Get the path to the ZIP file and set the location to extract to.
219 |     zip_file_path = import_dir + zip_name
220 |     extracted = f'/tmp/{zip_name.replace(".zip", "")}'
221 | 
222 |     # Extract the contents of the ZIP. This is done under the file lock
223 |     # to ensure that only one worker process unzips the contents.
224 |     with FileLock('/tmp/extract.lock'):
225 |         if not os.path.isdir(extracted):
226 |             with zipfile.ZipFile(zip_file_path, 'r') as myzip:
227 |                 myzip.extractall(extracted)
228 | 
229 |     # Add in front in case there are conflicting module names including original zipped file
230 |     sys.path.insert(0,extracted)
231 | 
232 | def run_generation(session: Session,
233 |                    STAGE_NAME: str,
234 |                    MODEL_NAME: str,
235 |                    SAMPLE_VALUE: int,
236 |                    ALLOW_JOINS: bool,
237 |                    TABLE_LIST: list[str]) -> str:
238 | 
239 |     import io
240 | 
241 |     import_src_zip()
242 |     from semantic_model_generator.generate_model import generate_model_str_from_snowflake
243 | 
244 |     if not MODEL_NAME:
245 |         raise ValueError("Please provide a name for your semantic model.")
246 |     elif not TABLE_LIST:
247 |         raise ValueError("Please select at least one table to proceed.")
248 |     else:
249 |         yaml_str = generate_model_str_from_snowflake(
250 |             base_tables=TABLE_LIST,
251 |             semantic_model_name=MODEL_NAME,
252 |             n_sample_values=SAMPLE_VALUE,  # type: ignore
253 |             conn=session.connection,
254 |             allow_joins=ALLOW_JOINS,
255 |         )
256 | 
257 |         session.file.put_stream(
258 |                 io.BytesIO(yaml_str.encode('utf-8')),
259 |                f"@{STAGE_NAME}/{MODEL_NAME}.yaml",
260 |                auto_compress=False,
261 |                overwrite=True,
262 |            )
263 |         return f"Semantic model file {MODEL_NAME}.yaml has been generated and saved to {STAGE_NAME}."
264 | $$;


--------------------------------------------------------------------------------
/sis_setup/looker_integration.sql:
--------------------------------------------------------------------------------
 1 | USE DATABASE CORTEX_ANALYST_SEMANTICS;
 2 | USE SCHEMA SEMANTIC_MODEL_GENERATOR;
 3 | 
 4 | CREATE OR REPLACE NETWORK RULE looker_rule
 5 |   MODE = EGRESS
 6 |   TYPE = HOST_PORT
 7 |   VALUE_LIST = ('<% looker_url %>');
 8 | 
 9 | CREATE OR REPLACE SECRET looker_client_secret
10 |   TYPE = GENERIC_STRING
11 |   SECRET_STRING = '<% client_secret %>';
12 | 
13 | CREATE OR REPLACE EXTERNAL ACCESS INTEGRATION looker_access_int
14 |   ALLOWED_NETWORK_RULES = (looker_rule)
15 |   ALLOWED_AUTHENTICATION_SECRETS = (looker_client_secret)
16 |   ENABLED = TRUE;
17 | 
18 | GRANT READ ON SECRET looker_client_secret TO ROLE <% streamlit_role %>;
19 | GRANT USAGE ON INTEGRATION looker_access_int TO ROLE <% streamlit_role %>;
20 | 
21 | USE ROLE <% streamlit_role %>;
22 | 
23 | ALTER STREAMLIT CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.SEMANTIC_MODEL_GENERATOR
24 |   SET EXTERNAL_ACCESS_INTEGRATIONS = (looker_access_int)
25 |   SECRETS = ('looker_client_secret' = CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.looker_client_secret);


--------------------------------------------------------------------------------
/sis_setup/sissetup_snowsightgit.sql:
--------------------------------------------------------------------------------
  1 | SET (streamlit_warehouse)=(SELECT CURRENT_WAREHOUSE());
  2 | 
  3 | CREATE DATABASE IF NOT EXISTS CORTEX_ANALYST_SEMANTICS;
  4 | USE DATABASE CORTEX_ANALYST_SEMANTICS;
  5 | 
  6 | -- Create API Integration for Git
  7 | CREATE OR REPLACE API INTEGRATION git_api_integration_snowflake_labs
  8 |   API_PROVIDER = git_https_api
  9 |   API_ALLOWED_PREFIXES = ('https://github.com/Snowflake-Labs')
 10 |   ENABLED = TRUE;
 11 | 
 12 | -- Create Git Repository
 13 | CREATE OR REPLACE GIT REPOSITORY git_snowflake_semantic_model_generator
 14 |   API_INTEGRATION = git_api_integration_snowflake_labs
 15 |   ORIGIN = 'https://github.com/Snowflake-Labs/semantic-model-generator.git';
 16 | 
 17 | ALTER GIT REPOSITORY git_snowflake_semantic_model_generator FETCH;
 18 | 
 19 | -- Create Schema to host streamlit app
 20 | CREATE SCHEMA IF NOT EXISTS CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR
 21 | COMMENT = '{"origin": "sf_sit",
 22 |             "name": "skimantics",
 23 |             "version": {"major": 2, "minor": 0},
 24 |             "attributes": {"deployment": "sis"}}';
 25 | 
 26 | -- Create stage for App logic and 3rd party packages
 27 | CREATE OR REPLACE STAGE CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE
 28 | DIRECTORY = (ENABLE = true)
 29 | COMMENT = '{"origin": "sf_sit",
 30 |             "name": "skimantics",
 31 |             "version": {"major": 2, "minor": 0},
 32 |             "attributes": {"deployment": "sis"}}';
 33 | 
 34 | -- Copy Files from Git Repository into App Stage
 35 | COPY FILES
 36 |   INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE
 37 |   FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/app_utils/
 38 |   PATTERN='.*[.]zip';
 39 | 
 40 | COPY FILES
 41 |   INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/
 42 |   FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/
 43 |   FILES = ('environment.yml', 'app.py');
 44 | 
 45 | COPY FILES
 46 |   INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/
 47 |   FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/semantic_model_generator/
 48 |   PATTERN='.*[.]py';
 49 |   
 50 | RM @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/tests;
 51 | RM @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/semantic_model_generator/output_models;
 52 | 
 53 | COPY FILES
 54 |   INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/images/
 55 |   FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/images/
 56 |   PATTERN='.*[.]png';
 57 | 
 58 | COPY FILES
 59 |   INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/journeys/
 60 |   FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/journeys/
 61 |   PATTERN='.*[.]py';
 62 | 
 63 | COPY FILES
 64 |   INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/partner/
 65 |   FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/partner/
 66 |   PATTERN='.*[.]py';
 67 | 
 68 | COPY FILES
 69 |   INTO @CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator/app_utils/
 70 |   FROM @CORTEX_ANALYST_SEMANTICS.PUBLIC.git_snowflake_semantic_model_generator/branches/main/app_utils/
 71 |   PATTERN='.*[.]py';
 72 | 
 73 | -- Create Streamlit App
 74 | CREATE OR REPLACE STREAMLIT CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.SEMANTIC_MODEL_GENERATOR
 75 | ROOT_LOCATION = '@CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator'
 76 | MAIN_FILE = 'app.py'
 77 | TITLE = "Semantic Model Generator"
 78 | IMPORTS = ('@cortex_analyst_semantics.semantic_model_generator.streamlit_stage/looker_sdk.zip',
 79 | '@cortex_analyst_semantics.semantic_model_generator.streamlit_stage/strictyaml.zip')
 80 | QUERY_WAREHOUSE = $streamlit_warehouse
 81 | COMMENT = '{"origin": "sf_sit",
 82 |             "name": "skimantics",
 83 |             "version": {"major": 2, "minor": 0},
 84 |             "attributes": {"deployment": "sis"}}';
 85 | 
 86 | 
 87 | -- Create Semantic Model Generation Callable
 88 | -- Zip src files for callable SPROC for generation
 89 | CREATE OR REPLACE PROCEDURE CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.ZIP_SRC_FILES(
 90 |     database STRING,
 91 |     schema STRING,
 92 |     stage STRING,
 93 |     source_path STRING,
 94 |     target_parent STRING,
 95 |     zip_filename STRING
 96 | )
 97 | RETURNS VARCHAR
 98 | LANGUAGE PYTHON
 99 | RUNTIME_VERSION = 3.10
100 | PACKAGES = (
101 |     'snowflake-snowpark-python==1.18.0'
102 | )
103 | HANDLER='zip_staged_files'
104 | EXECUTE AS CALLER
105 | AS $$
106 | from snowflake.snowpark import Session
107 | from typing import Optional
108 | 
109 | def get_staged_files(session: Session,
110 |                      database: str,
111 |                      schema: str,
112 |                      stage: str,
113 |                      target_parent: Optional[str] = None,
114 |                      source_path: Optional[str] = None,
115 |                      ) -> dict[str, str]:
116 |     
117 |     query = f"ls @{database}.{schema}.{stage}/{source_path}"
118 |     file_result = session.sql(query).collect()
119 | 
120 |     file_data = {}
121 |     for row in file_result:
122 |         filename = row['name'].split('/',1)[1] # Remove the stage name from the filename
123 | 
124 |         # If target_parent is provided, replace the original file pathing with it
125 |         if target_parent:
126 |             filename = filename.replace(source_path, f"{target_parent}")
127 | 
128 |         full_file_path = f"@{database}.{schema}.{row['name']}"
129 |         file_data[filename] = session.file.get_stream(f"{full_file_path}").read().decode('utf-8')
130 | 
131 |     return file_data
132 | 
133 | def create_zip(file_data: dict[str, str]) -> bytes:
134 |     import io
135 |     import zipfile
136 | 
137 |     zip_buffer = io.BytesIO()
138 | 
139 |     with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED
140 |                          ) as zipf:
141 |         for filename, content in file_data.items():
142 |             zipf.writestr(filename, content)
143 | 
144 |     zip_bytes = zip_buffer.getvalue()
145 | 
146 |     return zip_bytes
147 | 
148 | def upload_zip(session: Session,
149 |                database: str,
150 |                schema: str,
151 |                stage: str,
152 |                zip_file: bytes,
153 |                zip_filename: str,
154 |                ) -> None:
155 |     import io
156 | 
157 |     session.file.put_stream(
158 |                 io.BytesIO(zip_file),
159 |                 f"@{database}.{schema}.{stage}/{zip_filename.replace('zip','')}.zip",
160 |                 auto_compress=False,
161 |                 overwrite=True,
162 |             )
163 |     
164 | def zip_staged_files(session: Session,
165 |                      database: str,
166 |                      schema: str,
167 |                      stage: str,
168 |                      source_path: Optional[str] = None,
169 |                      target_parent: Optional[str] = None,
170 |                      zip_filename: Optional[str] = None,
171 |                      ) -> str:
172 |     
173 |     file_data = get_staged_files(session, database, schema, stage, target_parent, source_path)
174 |     zip_file = create_zip(file_data)
175 | 
176 |     if zip_filename:
177 |         zip_filename = zip_filename
178 |     elif target_parent is not None:
179 |         zip_filename = target_parent
180 |     elif source_path is not None:
181 |         zip_filename = source_path
182 |     else:
183 |         zip_filename = "zipped_files"
184 | 
185 |     upload_zip(session, database, schema, stage, zip_file, zip_filename)
186 | 
187 |     return f"Files zipped and uploaded to {database}.{schema}.{stage}/{zip_filename}.zip."
188 | 
189 | $$;
190 | 
191 | CALL CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.ZIP_SRC_FILES(
192 |     'CORTEX_ANALYST_SEMANTICS',
193 |     'SEMANTIC_MODEL_GENERATOR',
194 |     'streamlit_stage',
195 |     'semantic_model_generator/semantic_model_generator',
196 |     'semantic_model_generator',
197 |     'semantic_model_generator'
198 | );
199 | 
200 | -- Create generation callable
201 | CREATE OR REPLACE PROCEDURE CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.GENERATE_SEMANTIC_FILE(
202 |     STAGE_NAME STRING,
203 |     MODEL_NAME STRING,
204 |     SAMPLE_VALUE INT,
205 |     ALLOW_JOINS BOOLEAN,
206 |     TABLE_LIST ARRAY
207 | )
208 | RETURNS VARCHAR
209 | LANGUAGE PYTHON
210 | RUNTIME_VERSION = 3.10
211 | PACKAGES = (
212 |     'pandas==2.2.2',
213 |     'tqdm==4.66.5',
214 |     'loguru==0.5.3',
215 |     'protobuf==3.20.3',
216 |     'pydantic==2.8.2',
217 |     'pyyaml==6.0.1',
218 |     'ruamel.yaml==0.17.21',
219 |     'pyarrow==14.0.2',
220 |     'sqlglot==25.10.0',
221 |     'numpy==1.26.4',
222 |     'python-dotenv==0.21.0',
223 |     'urllib3==2.2.2',
224 |     'types-pyyaml==6.0.12.12',
225 |     'types-protobuf==4.25.0.20240417',
226 |     'snowflake-snowpark-python==1.18.0',
227 |     'cattrs==23.1.2',
228 |     'filelock'
229 | )
230 | IMPORTS = ('@CORTEX_ANALYST_SEMANTICS.SEMANTIC_MODEL_GENERATOR.STREAMLIT_STAGE/semantic_model_generator.zip',
231 |     '@cortex_analyst_semantics.semantic_model_generator.streamlit_stage/strictyaml.zip'
232 |             )
233 | HANDLER='run_generation'
234 | EXECUTE AS CALLER
235 | AS $$
236 | from snowflake.snowpark import Session
237 | 
238 | def import_src_zip(zip_name = 'semantic_model_generator.zip'):
239 |     """Unpacks source zip file in stage to enable importing it to mirror source code structure."""
240 | 
241 |     import os
242 |     import sys
243 |     import zipfile
244 |     from filelock import FileLock
245 | 
246 |     # Get the location of the import directory. Snowflake sets the import
247 |     # directory location so code can retrieve the location via sys._xoptions.
248 |     IMPORT_DIRECTORY_NAME = "snowflake_import_directory"
249 |     import_dir = sys._xoptions[IMPORT_DIRECTORY_NAME]
250 | 
251 |     # Get the path to the ZIP file and set the location to extract to.
252 |     zip_file_path = import_dir + zip_name
253 |     extracted = f'/tmp/{zip_name.replace(".zip", "")}'
254 | 
255 |     # Extract the contents of the ZIP. This is done under the file lock
256 |     # to ensure that only one worker process unzips the contents.
257 |     with FileLock('/tmp/extract.lock'):
258 |         if not os.path.isdir(extracted):
259 |             with zipfile.ZipFile(zip_file_path, 'r') as myzip:
260 |                 myzip.extractall(extracted)
261 | 
262 |     # Add in front in case there are conflicting module names including original zipped file
263 |     sys.path.insert(0,extracted)
264 | 
265 | def run_generation(session: Session,
266 |                    STAGE_NAME: str,
267 |                    MODEL_NAME: str,
268 |                    SAMPLE_VALUE: int,
269 |                    ALLOW_JOINS: bool,
270 |                    TABLE_LIST: list[str]) -> str:
271 | 
272 |     import io
273 | 
274 |     import_src_zip()
275 |     from semantic_model_generator.generate_model import generate_model_str_from_snowflake
276 | 
277 |     if not MODEL_NAME:
278 |         raise ValueError("Please provide a name for your semantic model.")
279 |     elif not TABLE_LIST:
280 |         raise ValueError("Please select at least one table to proceed.")
281 |     else:
282 |         yaml_str = generate_model_str_from_snowflake(
283 |             base_tables=TABLE_LIST,
284 |             semantic_model_name=MODEL_NAME,
285 |             n_sample_values=SAMPLE_VALUE,  # type: ignore
286 |             conn=session.connection,
287 |             allow_joins=ALLOW_JOINS,
288 |         )
289 | 
290 |         session.file.put_stream(
291 |                 io.BytesIO(yaml_str.encode('utf-8')),
292 |                f"@{STAGE_NAME}/{MODEL_NAME}.yaml",
293 |                auto_compress=False,
294 |                overwrite=True,
295 |            )
296 |         return f"Semantic model file {MODEL_NAME}.yaml has been generated and saved to {STAGE_NAME}."
297 | $$;


--------------------------------------------------------------------------------